diff --git a/CMakeLists.txt b/CMakeLists.txt index 1802e4a46d5bd..265ddc9504167 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,6 +168,9 @@ if(WITH_BRPC_RDMA) endif() endif() +# lite subgraph compilation depends on CUDNN_ROOT, +# so include(cudnn) needs to be in front of include(third_party/lite) +include(cudnn) # set cudnn libraries, must before configure include(third_party) # download, build, install third_party if(WITH_DISTRIBUTE) @@ -187,7 +190,6 @@ if(NOT WIN32) endif() include(flags) # set paddle compile flags -include(cudnn) # set cudnn libraries, must before configure if(WITH_GPU) include(cuda) @@ -216,6 +218,9 @@ endif(WITH_AMD_GPU) if(WITH_ARM) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE) + set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) + set(WITH_GPU OFF CACHE STRING "Disable GPU when compiling WITH_ARM=ON." FORCE) add_definitions(-DPADDLE_WITH_ARM) endif() diff --git a/README.md b/README.md index 1805faeb11f03..b07709facd528 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@  -# PaddlePaddle +

+ +

+ +-------------------------------------------------------------------------------- English | [简体中文](./README_cn.md) @@ -29,7 +33,7 @@ pip install paddlepaddle # Linux GPU cuda10cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu==1.8.2.post97 +pip install paddlepaddle-gpu==1.8.3.post97 ``` It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website. diff --git a/README_cn.md b/README_cn.md index dccd4f227b8d1..93ad06d20010f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,5 +1,9 @@  -# PaddlePaddle +

+ +

+ +-------------------------------------------------------------------------------- [English](./README.md) | 简体中文 @@ -26,7 +30,7 @@ pip install paddlepaddle # Linux GPU cuda10cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu==1.8.2.post97 +pip install paddlepaddle-gpu==1.8.3.post97 ``` 更多安装信息详见官网 [安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 49488c855f930..978b0427125be 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -25,7 +25,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite) if(NOT LITE_GIT_TAG) - set(LITE_GIT_TAG ab8af5c4b4dc5b40217633e0aa436315912d7b53) + set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa) endif() if(NOT CUDA_ARCH_NAME) @@ -83,7 +83,7 @@ message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}") include_directories(${LITE_SOURCE_DIR}) include_directories(${LITE_BINARY_DIR}) -function(external_lite_static_libs alias path) +function(external_lite_libs alias path) add_library(${alias} SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) @@ -92,7 +92,16 @@ function(external_lite_static_libs alias path) endif() endfunction() -external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) +external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) +set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) + +if(XPU_SDK_ROOT) + include_directories("${XPU_SDK_ROOT}/XTDK/include") + include_directories("${XPU_SDK_ROOT}/XTCL/include") + add_definitions(-DPADDLE_WITH_XPU) + LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/") + LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/") +endif() 
add_definitions(-DPADDLE_WITH_LITE) add_definitions(-DLITE_WITH_LOG) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 5e47f268a3669..5bc7eaaff3abe 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -20,6 +20,8 @@ SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git) SET(CBLAS_TAG v0.3.7) IF(WITH_ARM) + # Under the FT2000 architecture, the calculation result of blas.sgemm in openblas 0.3+ is wrong, + # so version 0.2 is used by default. SET(CBLAS_TAG v0.2.18) ENDIF() cache_third_party(extern_openblas diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 04f22d7fc8775..82dd4fa2e8eae 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -145,9 +145,9 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "") find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) SET(PROTOBUF_FOUND true) + message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") SET_PROTOBUF_VERSION() PROMPT_PROTOBUF_LIB() - message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") endif() endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e6a77c38ab5c0..64878693518b6 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -8,6 +8,8 @@ function(CheckCompilerCXX11Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) message(FATAL_ERROR "Unsupported GCC version. 
GCC >= 4.8 required.") + elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) + message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 6fc81f2387b78..5a889dbc31438 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -19,9 +19,12 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING "A path setting fluid inference shared and static libraries") +# TODO(zhaolong) +# At present, the size of static lib in Windows exceeds the system limit, +# so the generation of static lib is temporarily turned off. if(WIN32) #todo: remove the option - option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) + option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." 
OFF) if(NOT PYTHON_EXECUTABLE) FIND_PACKAGE(PythonInterp REQUIRED) endif() @@ -187,21 +190,18 @@ copy(inference_lib_dist SRCS ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) + # CAPI inference library for only inference set(FLUID_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_c_install_dir" CACHE STRING "A path setting CAPI fluid inference shared") copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_C_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") -if(WIN32) - set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*) -else(WIN32) - set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*) -endif(WIN32) +set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib} - DSTS ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/lib) + SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib} + DSTS ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference set(fluid_lib_deps inference_lib_dist) diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index be84c54fd2fa1..9124fec0b856a 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -7,14 +7,14 @@ if(WIN32) return() endif() -set(NCCL_ROOT "/usr" CACHE PATH "NCCL ROOT") -find_path(NCCL_INCLUDE_DIR nccl.h - PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include ${NCCL_ROOT}/local/include - $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include $ENV{NCCL_ROOT}/local/include - NO_DEFAULT_PATH -) - if(WITH_NCCL) + set(NCCL_ROOT "/usr" CACHE PATH "NCCL ROOT") + find_path(NCCL_INCLUDE_DIR nccl.h + PATHS ${NCCL_ROOT} ${NCCL_ROOT}/include 
${NCCL_ROOT}/local/include + $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include $ENV{NCCL_ROOT}/local/include + NO_DEFAULT_PATH + ) + file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS) string(REGEX MATCH "define NCCL_VERSION_CODE +([0-9]+)" diff --git a/doc/imgs/logo.png b/doc/imgs/logo.png new file mode 100644 index 0000000000000..3ed4cc8ec82ee Binary files /dev/null and b/doc/imgs/logo.png differ diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index 7424bae1ab865..10abb83116624 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -63,7 +63,8 @@ class Array { HOSTDEVICE inline const T &at(size_t i) const { #ifndef __CUDA_ARCH__ - PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); + PADDLE_ENFORCE_LT( + i, N, platform::errors::OutOfRange("Array index out of bounds.")); #endif return (*this)[i]; } @@ -106,7 +107,7 @@ class Array { static T obj(); return obj; #else - PADDLE_THROW("Array has no element"); + PADDLE_THROW(platform::errors::Unavailable("Array has no element.")); #endif } @@ -115,7 +116,7 @@ class Array { static const T obj(); return obj; #else - PADDLE_THROW("Array has no element"); + PADDLE_THROW(platform::errors::Unavailable("Array has no element.")); #endif } diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 9f8f17cd1ac68..4c7ef2e600bc1 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -77,11 +77,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto var_name : fetch_var_names) { auto var_desc = block.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( - var_desc, platform::errors::NotFound("%s is not found.", var_name)); + var_desc, platform::errors::NotFound( + "Variable %s is not found in main program.", var_name)); auto shapes = var_desc->GetShape(); - PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, - "var %s: Fetched var has wrong shape, " - "only variables 
with the last dimension size 1 supported", + PADDLE_ENFORCE_EQ(shapes[shapes.size() - 1], 1, + platform::errors::InvalidArgument( + "Fetched variable %s has wrong shape, " + "only variables whose last dimension is 1 are supported", var_name); } @@ -95,7 +97,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, actual_thread_num_ = thread_num; int file_cnt = filelist.size(); PADDLE_ENFORCE_GT(file_cnt, 0, - platform::errors::NotFound("Input file list is empty")); + platform::errors::NotFound("Input file list is empty.")); if (actual_thread_num_ > file_cnt) { VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index fabf2abfc803b..9ca3fe31a33c7 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -72,7 +72,8 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { return val; } default: - PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); + PADDLE_THROW(platform::errors::Unavailable("Unsupport attribute type %d.", + attr_desc.type())); } return boost::blank(); } diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 21bb39b043987..e516ae1efdfc6 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -37,9 +37,10 @@ struct ExtractAttribute { try { attr_value = &boost::get(attr); } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", - attr_name_, paddle::platform::demangle(typeid(T).name()), - paddle::platform::demangle(attr.type().name())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get attribute (%s) by type %s, its type is %s.", attr_name_, + paddle::platform::demangle(typeid(T).name()), + paddle::platform::demangle(attr.type().name()))); } return attr_value; } @@ -70,8 +71,9 @@ struct ExtractAttribute { try { attr_value = &boost::get(attr); } catch 
(boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", - attr_name_, paddle::platform::demangle(attr.type().name())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get attribute (%s) by type bool, its type is %s.", attr_name_, + paddle::platform::demangle(attr.type().name()))); } return attr_value; } @@ -96,8 +98,9 @@ struct ExtractAttribute { try { attr_value = &boost::get(attr); } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", - attr_name_, paddle::platform::demangle(attr.type().name())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get attribute (%s) by type int64_t, its type is %s.", + attr_name_, paddle::platform::demangle(attr.type().name()))); } return attr_value; } @@ -124,8 +127,10 @@ struct ExtractAttribute> { try { attr_value = &boost::get>(attr); } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", - attr_name_, paddle::platform::demangle(attr.type().name())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get attribute (%s) by type std::vector, its type is " + "%s.", + attr_name_, paddle::platform::demangle(attr.type().name()))); } return attr_value; } @@ -150,8 +155,9 @@ struct ExtractAttribute { try { attr_value = &boost::get(attr); } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type float, its type is %s", - attr_name_, paddle::platform::demangle(attr.type().name())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get attribute (%s) by type float, its type is %s.", + attr_name_, paddle::platform::demangle(attr.type().name()))); } return attr_value; } @@ -173,8 +179,9 @@ class AttrReader { template inline const T& Get(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE_NE(attrs_.count(name), 0, + platform::errors::NotFound( + 
"Attribute (%s) should be in AttributeMap.", name)); Attribute& attr = const_cast(attrs_.at(name)); ExtractAttribute extract_attr(name); @@ -192,8 +199,10 @@ class GreaterThanChecker { public: explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(const T& value) const { - PADDLE_ENFORCE_GT(value, lower_bound_, - platform::errors::OutOfRange("larger_than check fails.")); + PADDLE_ENFORCE_GT( + value, lower_bound_, + platform::errors::OutOfRange( + "Check for attribute value greater than a certain value failed.")); } private: @@ -205,7 +214,10 @@ class EqualGreaterThanChecker { public: explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(const T& value) const { - PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); + PADDLE_ENFORCE_GE( + value, lower_bound_, + platform::errors::OutOfRange("Check for attribute valur equal or " + "greater than a certain value failed.")); } private: @@ -231,9 +243,10 @@ class EnumInContainer { public: explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} void operator()(const T& val) const { - PADDLE_ENFORCE(container_.find(val) != container_.end(), - "Value %s is not in enum container %s", val, - ContainerDebugString()); + PADDLE_ENFORCE_NE( + container_.find(val), container_.end(), + platform::errors::NotFound("Value %s is not in enum container %s.", val, + ContainerDebugString())); } private: @@ -284,8 +297,11 @@ class TypedAttrChecker { // we can add more common limits, like LessThan(), Between()... 
TypedAttrChecker& SetDefault(const T& default_value) { - PADDLE_ENFORCE(default_value_setter_.empty(), - "%s can't have more than one default value!", attr_name_); + PADDLE_ENFORCE_EQ( + default_value_setter_.empty(), true, + platform::errors::AlreadyExists( + "Attribute (%s) has a default value and cannot be set repeatedly.", + attr_name_)); default_value_setter_.push_back(DefaultValueSetter(default_value)); return *this; } @@ -308,8 +324,10 @@ class TypedAttrChecker { auto it = attr_map->find(attr_name_); if (it == attr_map->end()) { // user do not set this attr - PADDLE_ENFORCE(!default_value_setter_.empty(), - "Attribute '%s' is required!", attr_name_); + PADDLE_ENFORCE_EQ( + default_value_setter_.empty(), false, + platform::errors::InvalidArgument( + "Attribute (%s) is not set correctly.", attr_name_)); // default_value_setter_ has no more than one element attr_map->emplace(attr_name_, default_value_setter_[0]()); } diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index fee6ba4004705..a79bc4bc2cf5f 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -23,7 +23,8 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, PADDLE_ENFORCE_NE( in.place().which(), dst_place.which(), - "Currently, model parallelism is only supported between CPU and CUDA"); + platform::errors::Unavailable("Currently, model parallelism is only " + "supported between CPU and CUDA.")); // NOTE(yy): TransDataDevice should wait for computation of input. 
platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 566a08d8a2ad1..96d54ec869174 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -133,11 +133,14 @@ bool DataFeed::PickOneFile(std::string* filename) { } void DataFeed::CheckInit() { - PADDLE_ENFORCE(finish_init_, "Initialization did not succeed."); + PADDLE_ENFORCE_EQ(finish_init_, true, platform::errors::PreconditionNotMet( + "DataFeed initialization failed.")); } void DataFeed::CheckSetFileList() { - PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed."); + PADDLE_ENFORCE_EQ( + finish_set_filelist_, true, + platform::errors::PreconditionNotMet("DataFeed set filelist failed.")); } void DataFeed::CheckStart() { @@ -160,14 +163,18 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) { #ifdef PADDLE_WITH_CUDA cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); #else - PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); + PADDLE_THROW(platform::errors::Unimplemented( + "Not supported GPU, please compile with option WITH_GPU=ON.")); #endif } } template void PrivateQueueDataFeed::SetQueueSize(int queue_size) { - PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); + PADDLE_ENFORCE_GT( + queue_size, 0, + platform::errors::InvalidArgument( + "Queue size %d is illegal in PrivateQueueDataFeed.", queue_size)); queue_size_ = queue_size; queue_ = paddle::framework::MakeChannel(); queue_->SetCapacity(queue_size); @@ -418,8 +425,10 @@ void MultiSlotDataFeed::Init( finish_set_filelist_ = false; finish_start_ = false; - PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), - "Multi_slot_desc has not been set."); + PADDLE_ENFORCE_EQ( + data_feed_desc.has_multi_slot_desc(), true, + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in MultiSlotDataFeed.")); paddle::framework::MultiSlotDesc 
multi_slot_desc = data_feed_desc.multi_slot_desc(); SetBatchSize(data_feed_desc.batch_size()); @@ -668,13 +677,14 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { for (size_t i = 0; i < use_slots_index_.size(); ++i) { int idx = use_slots_index_[i]; int num = strtol(&str[pos], &endptr, 10); - PADDLE_ENFORCE( - num, - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s", - str); + PADDLE_ENFORCE_NE( + num, 0, + platform::errors::InvalidArgument( + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s.", + str)); if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); @@ -765,8 +775,10 @@ void MultiSlotInMemoryDataFeed::Init( finish_set_filelist_ = false; finish_start_ = false; - PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), - "Multi_slot_desc has not been set."); + PADDLE_ENFORCE_EQ( + data_feed_desc.has_multi_slot_desc(), true, + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in MultiSlotInMemoryDataFeed.")); paddle::framework::MultiSlotDesc multi_slot_desc = data_feed_desc.multi_slot_desc(); SetBatchSize(data_feed_desc.batch_size()); @@ -898,13 +910,14 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { for (size_t i = 0; i < use_slots_index_.size(); ++i) { int idx = use_slots_index_[i]; int num = strtol(&str[pos], &endptr, 10); - PADDLE_ENFORCE( - num, - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s", - str); + PADDLE_ENFORCE_NE( + num, 0, + 
platform::errors::InvalidArgument( + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s.", + str)); if (idx != -1) { if (all_slots_type_[i][0] == 'f') { // float for (int j = 0; j < num; ++j) { @@ -963,13 +976,14 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) { for (size_t i = 0; i < use_slots_index_.size(); ++i) { int idx = use_slots_index_[i]; int num = strtol(&str[pos], &endptr, 10); - PADDLE_ENFORCE( - num, - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s", - str); + PADDLE_ENFORCE_NE( + num, 0, + platform::errors::InvalidArgument( + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s.", + str)); if (idx != -1) { if (all_slots_type_[i][0] == 'f') { // float @@ -1085,7 +1099,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( PADDLE_ENFORCE_EQ(slot_offset.size(), 2, platform::errors::InvalidArgument( "In batch reader, the sparse tensor lod size " - "must be 2, but received %d", + "must be 2, but received %d.", slot_offset.size())); const auto& max_size = slot_offset[1]; tmp_offset.reserve(max_size + 1); @@ -1137,10 +1151,13 @@ void PrivateInstantDataFeed::PutToFeedVec() { for (const auto e : use_slots_shape_[i]) { total_dims *= e; } - PADDLE_ENFORCE( - total_dims == total_instance, - "The actual data size of slot[%s] doesn't match its declaration", - use_slots_[i].c_str()); + PADDLE_ENFORCE_EQ( + total_dims, total_instance, + platform::errors::InvalidArgument( + "The actual data size of slot[%s] doesn't match its 
declaration. " + "The actual data size of slot is %lld" + ", and its declaration is %lld.", + use_slots_[i].c_str(), total_dims, total_instance)); feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); } } @@ -1162,7 +1179,9 @@ int PrivateInstantDataFeed::Next() { return -1; } - PADDLE_ENFORCE(true == ParseOneMiniBatch(), "Fail to parse mini-batch data"); + PADDLE_ENFORCE_EQ( + true, ParseOneMiniBatch(), + platform::errors::InvalidArgument("Fail to parse mini-batch data.")); PutToFeedVec(); return ins_vec_[0].GetBatchSize(); } @@ -1173,8 +1192,10 @@ void PrivateInstantDataFeed::Init(const DataFeedDesc& data_feed_desc) { finish_set_filelist_ = false; finish_start_ = false; - PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), - "Multi_slot_desc has not been set."); + PADDLE_ENFORCE_EQ( + data_feed_desc.has_multi_slot_desc(), true, + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in PrivateInstantDataFeed.")); paddle::framework::MultiSlotDesc multi_slot_desc = data_feed_desc.multi_slot_desc(); SetBatchSize(data_feed_desc.batch_size()); @@ -1217,7 +1238,10 @@ template class PrivateInstantDataFeed>; bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) { fd_ = open(filename.c_str(), O_RDONLY); - PADDLE_ENFORCE(fd_ != -1, "Fail to open file: %s", filename.c_str()); + PADDLE_ENFORCE_NE( + fd_, -1, platform::errors::Unavailable( + "Fail to open file: %s in MultiSlotFileInstantDataFeed.", + filename.c_str())); struct stat sb; fstat(fd_, &sb); @@ -1225,7 +1249,11 @@ bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) { buffer_ = reinterpret_cast(mmap(NULL, end_, PROT_READ, MAP_PRIVATE, fd_, 0)); - PADDLE_ENFORCE(buffer_ != MAP_FAILED, strerror(errno)); + PADDLE_ENFORCE_NE( + buffer_, MAP_FAILED, + platform::errors::Unavailable( + "Memory map failed when create shared memory, error number is %s.", + strerror(errno))); offset_ = 0; return true; @@ -1257,12 +1285,13 @@ bool 
MultiSlotFileInstantDataFeed::ParseOneMiniBatch() { char type = all_slots_type_[i][0]; uint16_t num = *reinterpret_cast(buffer_ + offset_); - PADDLE_ENFORCE( - num, - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - "characters."); + PADDLE_ENFORCE_NE( + num, 0, + platform::errors::InvalidArgument( + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.")); offset_ += sizeof(uint16_t); if (idx != -1) { @@ -1304,7 +1333,12 @@ bool MultiSlotFileInstantDataFeed::ParseOneMiniBatch() { } PADDLE_ENFORCE(batch_size_ == default_batch_size_ || offset_ == end_, - "offset_ != end_"); + platform::errors::InvalidArgument( + "The batch size id not equal to default batch size, or " + "the offset is not equal to end index." + "The batch size is %d, default batcch size is %d, offset " + "is %d, end index is %d.", + batch_size_, default_batch_size_, offset_, end_)); return true; } #endif diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index ef49b28cdbc81..b48d152fe3582 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -116,7 +116,8 @@ class DataFeed { virtual ~DataFeed() {} virtual void Init(const DataFeedDesc& data_feed_desc) = 0; virtual bool CheckFile(const char* filename) { - PADDLE_THROW("This function(CheckFile) is not implemented."); + PADDLE_THROW(platform::errors::Unimplemented( + "This function(CheckFile) is not implemented.")); } // Set filelist for DataFeed. // Pay attention that it must init all readers before call this function. 
@@ -179,7 +180,8 @@ class DataFeed { } virtual int GetCurBatchSize() { return batch_size_; } virtual void LoadIntoMemory() { - PADDLE_THROW("This function(LoadIntoMemory) is not implemented."); + PADDLE_THROW(platform::errors::Unimplemented( + "This function(LoadIntoMemory) is not implemented.")); } virtual void SetPlace(const paddle::platform::Place& place) { place_ = place; @@ -438,14 +440,23 @@ class MultiSlotType { private: void CheckType(const std::string& type) const { - PADDLE_ENFORCE((type == "uint64") || (type == "float"), - "There is no this type<%s>.", type); + PADDLE_ENFORCE_EQ((type == "uint64" || type == "float"), true, + platform::errors::InvalidArgument( + "MultiSlotType error, expect type is uint64 or " + "float, but received type is %s.", + type)); } void CheckFloat() const { - PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_); + PADDLE_ENFORCE_EQ( + type_[0], 'f', + platform::errors::InvalidArgument( + "MultiSlotType error, add %s value to float slot.", type_)); } void CheckUint64() const { - PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_); + PADDLE_ENFORCE_EQ( + type_[0], 'u', + platform::errors::InvalidArgument( + "MultiSlotType error, add %s value to uint64 slot.", type_)); } std::vector float_feasign_; std::vector uint64_feasign_; diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc index 9a055765b8c91..2cc441bbd34cb 100644 --- a/paddle/fluid/framework/data_feed_test.cc +++ b/paddle/fluid/framework/data_feed_test.cc @@ -34,8 +34,10 @@ paddle::framework::DataFeedDesc load_datafeed_param_from_file( const char* filename) { paddle::framework::DataFeedDesc data_feed_desc; int file_descriptor = open(filename, O_RDONLY); - PADDLE_ENFORCE_NE(file_descriptor, -1, platform::errors::Unavailable( - "Cannot open file %s.", filename)); + PADDLE_ENFORCE_NE( + file_descriptor, -1, + platform::errors::Unavailable( + "Cannot open file %s when load datafeed param from file.", 
filename)); google::protobuf::io::FileInputStream fileInput(file_descriptor); google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc); close(file_descriptor); @@ -45,8 +47,10 @@ paddle::framework::DataFeedDesc load_datafeed_param_from_file( const std::vector load_filelist_from_file(const char* filename) { std::vector filelist; std::ifstream fin(filename); - PADDLE_ENFORCE_EQ(fin.good(), true, platform::errors::Unavailable( - "Cannot open file %s.", filename)); + PADDLE_ENFORCE_EQ( + fin.good(), true, + platform::errors::Unavailable( + "Cannot open file %s when load filelist from file.", filename)); std::string line; while (getline(fin, line)) { filelist.push_back(line); @@ -196,7 +200,8 @@ void GetElemSetFromReader(std::vector* reader_elem_set, } } } else { - PADDLE_THROW("Error type in proto file."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Error type in proto file.")); } } else { // sparse branch if (slot.type() == "uint64") { @@ -218,7 +223,8 @@ void GetElemSetFromReader(std::vector* reader_elem_set, } } } else { - PADDLE_THROW("Error type in proto file."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Error type in proto file.")); } } // end sparse branch ++index; @@ -272,7 +278,10 @@ void GetElemSetFromFile(std::vector* file_elem_set, file_elem_set->resize(used_slot_num); for (const auto& file : filelist) { std::ifstream fin(file.c_str()); - PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str()); + PADDLE_ENFORCE_EQ( + fin.good(), true, + platform::errors::Unavailable( + "Can not open %s when get element set from file.", file.c_str())); while (1) { bool end_flag = false; int index = 0; @@ -298,7 +307,8 @@ void GetElemSetFromFile(std::vector* file_elem_set, } } } else { - PADDLE_THROW("Error type in proto file."); + PADDLE_THROW( + platform::errors::InvalidArgument("Error type in proto file.")); } if (slot.is_used()) { ++index; diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 
b611bb77b4e1e..947f06408d028 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -45,7 +45,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { } else if (s == "MKLDNNLAYOUT") { return DataLayout::kMKLDNN; } else { - PADDLE_THROW("Unknown storage order string: %s", s); + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown data layout type string: %s.", s)); } } @@ -60,7 +61,8 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) { case DataLayout::kMKLDNN: return "MKLDNNLAYOUT"; default: - PADDLE_THROW("unknown DataLayout %d", data_layout); + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown Data Layout type %d.", data_layout)); } } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 59a76ce103c0e..3cea7a66d0105 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -25,14 +25,17 @@ namespace paddle { namespace framework { std::vector GetAxis(const DataLayout& from, const DataLayout& to) { - PADDLE_ENFORCE_NE(from, to, - "layout transform should transform different layout"); + PADDLE_ENFORCE_NE( + from, to, + platform::errors::InvalidArgument( + "Layout transform should transform between different layout.")); if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) { return {0, 2, 3, 1}; } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) { return {0, 3, 1, 2}; } else { - PADDLE_THROW("unsupported transform"); + PADDLE_THROW( + platform::errors::InvalidArgument("Unsupported layout transform.")); } } @@ -55,7 +58,8 @@ struct CastDataLayout { auto* context = static_cast(ctx_); trans4(*context, in_, out_, axis_); } else { - PADDLE_THROW("Unsupport CPU <-> GPU!"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Unsupported data layout cast from CPU to GPU.")); } } }; @@ -66,9 +70,14 @@ void TransDataLayout(const OpKernelType& 
kernel_type_for_var, PADDLE_ENFORCE( platform::places_are_same_class(kernel_type_for_var.place_, expected_kernel_type.place_), - "TransDataLayout only support DataLayout transform on same place!"); + platform::errors::PreconditionNotMet( + "TransDataLayout only support DataLayout transform on same place.")); - PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!"); + PADDLE_ENFORCE_EQ( + arity(in.dims()), 4, + platform::errors::InvalidArgument( + "Input dimension arity only can be 4, the input dimension is %s.", + in.dims())); auto& pool = platform::DeviceContextPool::Instance(); @@ -108,7 +117,8 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { case mkldnn::memory::data_type::s32: return platform::to_void_cast(tensor.data()); default: - PADDLE_THROW("wrong mkldnn type provided"); + PADDLE_THROW( + platform::errors::InvalidArgument("Wrong mkldnn type provided.")); } } @@ -121,8 +131,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, PADDLE_ENFORCE( in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, - "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " - "non-MKLDNN"); + platform::errors::InvalidArgument( + "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " + "non-MKLDNN")); innerTransDataLayoutFromMKLDNN( in_layout, @@ -155,7 +166,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE_NE(in_type, memory::data_type::undef, - "Input tensor type is not supported: %s", in.type()); + platform::errors::InvalidArgument( + "Input tensor type (%s) is not supported.", + DataTypeToString(in.type()))); auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 711146efd267b..6eb84ef9d7c01 100644 --- 
a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -38,8 +38,9 @@ inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) { case DataLayout::kNCHW: return MKLDNNMemoryFormat::nchw; default: - PADDLE_THROW("Fail to convert layout %s to MKLDNN format", - DataLayoutToString(layout)); + PADDLE_THROW(platform::errors::InvalidArgument( + "Fail to convert layout %s to MKLDNN format.", + DataLayoutToString(layout))); } } @@ -50,7 +51,8 @@ inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) { case MKLDNNMemoryFormat::nchw: return DataLayout::kNCHW; default: - PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Fail to convert MKLDNN format to paddle layout.")); } } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 76c53e8231577..f54311eebfade 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -45,9 +45,10 @@ void TransformData(const OpKernelType &expected_kernel_type, if (NeedTransformLayout(lout, lin)) { #ifdef PADDLE_WITH_MKLDNN if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { - PADDLE_ENFORCE( - !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), - "No layout transform needed between two MKLDNN OPKernels"); + PADDLE_ENFORCE_EQ( + !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), true, + platform::errors::PreconditionNotMet( + "No layout transform needed between two MKLDNN OPKernels.")); if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel @@ -96,7 +97,10 @@ void TransformData(const OpKernelType &expected_kernel_type, PassTensorData(&out, &in); } - PADDLE_ENFORCE(transformed, "No transform is applied, please check!"); + PADDLE_ENFORCE_EQ( + transformed, true, + platform::errors::PreconditionNotMet( + "No 
transform is applied for the data needs to be transformed.")); // get output data output_tensor->ShareDataWith(in); } @@ -116,7 +120,10 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); } else { - PADDLE_THROW("unknown var type"); + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported variable type, only supports LoDTensor or SelectedRows, " + "but the input variable type is %s.", + ToTypeName(in_var.Type()))); } } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index a0248cf3c7569..f479d92483c1c 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -65,7 +65,8 @@ proto::VarType::Type ToDataType(std::type_index type) { if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + PADDLE_THROW(platform::errors::Unimplemented( + "Not support %s as tensor data type.", platform::demangle(type.name()))); } std::type_index ToTypeIndex(proto::VarType::Type type) { @@ -73,8 +74,9 @@ std::type_index ToTypeIndex(proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_cpp_.end()) { return it->second; } - PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type", - static_cast(type)); + PADDLE_THROW(platform::errors::Unimplemented( + "Not support proto::VarType::Type(%d) as tensor type.", + static_cast(type))); } std::string DataTypeToString(const proto::VarType::Type type) { @@ -82,8 +84,9 @@ std::string DataTypeToString(const proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_str_.end()) { return it->second; } - PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type", - static_cast(type)); + PADDLE_THROW(platform::errors::Unimplemented( + "Not support proto::VarType::Type(%d) as tensor type.", + static_cast(type))); } size_t 
SizeOfType(proto::VarType::Type type) { @@ -91,7 +94,8 @@ size_t SizeOfType(proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); + PADDLE_THROW(platform::errors::Unimplemented("Not support %s as tensor type.", + DataTypeToString(type))); } } // namespace framework diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index e3b45d05d85e9..2c4a7b4d02727 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -78,7 +78,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { _ForEachDataType_(VisitDataTypeCallback); #undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Not supported proto::VarType::Type(%d) as data type.", + static_cast(type))); } template diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index d79f8cacb5f47..44542f05d9d5c 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -56,7 +56,8 @@ struct CastDataType { context->Wait(); #endif } else { - PADDLE_THROW("Unsupported place!"); + PADDLE_THROW(platform::errors::Unimplemented( + "Place type is not supported when casting data type.")); } } }; @@ -98,7 +99,9 @@ void TransDataType(const OpKernelType& kernel_type_for_var, framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; default: - PADDLE_THROW("Not support type %d", src_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when casting data type.", + DataTypeToString(src_type))); } } diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 799deec1b6955..fe7d243066237 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -81,9 +81,11 @@ bool 
contain_unknown_dim(const DDim& ddim) { } DDim slice_ddim(const DDim& dim, int begin, int end) { - PADDLE_ENFORCE(begin >= 0 && end <= dim.size(), - "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", - begin, end, dim.size()); + PADDLE_ENFORCE_EQ( + (begin >= 0 && end <= dim.size()), true, + platform::errors::InvalidArgument( + "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", begin, + end, dim.size())); // Constructor of DDim would check whether end - begin is valid return DDim(dim.Get() + begin, end - begin); } diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index cbc8b0fb7cc78..29c4732f99118 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -29,20 +29,23 @@ namespace framework { return (callback); \ } -#define PADDLE_VISIT_DDIM(rank, callback) \ - switch (rank) { \ - PADDLE_VISIT_DDIM_BASE(0, callback); \ - PADDLE_VISIT_DDIM_BASE(1, callback); \ - PADDLE_VISIT_DDIM_BASE(2, callback); \ - PADDLE_VISIT_DDIM_BASE(3, callback); \ - PADDLE_VISIT_DDIM_BASE(4, callback); \ - PADDLE_VISIT_DDIM_BASE(5, callback); \ - PADDLE_VISIT_DDIM_BASE(6, callback); \ - PADDLE_VISIT_DDIM_BASE(7, callback); \ - PADDLE_VISIT_DDIM_BASE(8, callback); \ - PADDLE_VISIT_DDIM_BASE(9, callback); \ - default: \ - PADDLE_THROW("Invalid rank %d", rank); \ +#define PADDLE_VISIT_DDIM(rank, callback) \ + switch (rank) { \ + PADDLE_VISIT_DDIM_BASE(0, callback); \ + PADDLE_VISIT_DDIM_BASE(1, callback); \ + PADDLE_VISIT_DDIM_BASE(2, callback); \ + PADDLE_VISIT_DDIM_BASE(3, callback); \ + PADDLE_VISIT_DDIM_BASE(4, callback); \ + PADDLE_VISIT_DDIM_BASE(5, callback); \ + PADDLE_VISIT_DDIM_BASE(6, callback); \ + PADDLE_VISIT_DDIM_BASE(7, callback); \ + PADDLE_VISIT_DDIM_BASE(8, callback); \ + PADDLE_VISIT_DDIM_BASE(9, callback); \ + default: \ + PADDLE_THROW(platform::errors::Unimplemented( \ + "Invalid dimension to be accessed. 
Now only supports access to " \ + "dimension 0 to 9, but received dimension is %d.", \ + rank)); \ } template @@ -92,13 +95,31 @@ class DDim { inline int64_t operator[](int idx) const { return dim_[idx]; } - inline int64_t& at(int idx) { - PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + int64_t& at(int idx) { + PADDLE_ENFORCE_GE(idx, 0, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, idx)); + PADDLE_ENFORCE_LT(idx, rank_, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, idx)); return dim_[idx]; } - inline int64_t at(int idx) const { - PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + int64_t at(int idx) const { + PADDLE_ENFORCE_GE(idx, 0, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, idx)); + PADDLE_ENFORCE_LT(idx, rank_, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. 
The valid index " + "is between 0 and %d, but received index is %d.", + rank_, idx)); return dim_[idx]; } diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 9bcd79cd34f07..cc7d60b148def 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -33,22 +33,27 @@ message DistributedStrategy { optional int32 localsgd_k_step = 7 [ default = 4 ]; optional bool dgc = 8 [ default = false ]; optional bool hierachical_allreduce = 9 [ default = false ]; - optional int32 nccl_comm_num = 10 [ default = 1 ]; - optional bool gradient_merge = 11 [ default = false ]; - optional int32 gradient_merge_k_step = 12 [ default = 1 ]; - optional bool sequential_execution = 13 [ default = false ]; - optional bool enable_backward_optimizer_op_deps = 14 [ default = true ]; - optional bool lars = 15 [ default = false ]; - optional bool lamb = 16 [ default = false ]; - optional bool fuse_elewise_add_act_ops = 17 [ default = false ]; - optional bool fuse_bn_act_ops = 18 [ default = false ]; - optional bool enable_auto_fusion = 19 [ default = false ]; - optional bool fuse_relu_depthwise_conv = 20 [ default = false ]; - optional bool enable_inplace = 21 [ default = false ]; - optional bool fuse_all_reduce_ops = 22 [ default = false ]; - optional int32 num_iteration_per_drop_scope = 23 [ default = 1 ]; - optional bool sync_batch_norm = 24 [ default = false ]; - optional bool fuse_all_optimizer_ops = 25 [ default = false ]; + optional int32 hierachical_allreduce_inter_ranks = 10 [ default = 1 ]; + optional int32 nccl_comm_num = 11 [ default = 1 ]; + optional bool gradient_merge = 12 [ default = false ]; + optional int32 gradient_merge_k_step = 13 [ default = 1 ]; + optional bool sequential_execution = 14 [ default = false ]; + optional bool enable_backward_optimizer_op_deps = 15 [ default = true ]; + optional bool lars = 16 [ default = false ]; + optional bool lamb = 
17 [ default = false ]; + optional bool fuse_elewise_add_act_ops = 18 [ default = false ]; + optional bool fuse_bn_act_ops = 19 [ default = false ]; + optional bool enable_auto_fusion = 20 [ default = false ]; + optional bool fuse_relu_depthwise_conv = 21 [ default = false ]; + optional bool enable_inplace = 22 [ default = false ]; + optional bool fuse_all_reduce_ops = 23 [ default = false ]; + optional int32 num_iteration_per_drop_scope = 24 [ default = 1 ]; + optional bool sync_batch_norm = 25 [ default = false ]; + optional bool fuse_all_optimizer_ops = 26 [ default = false ]; + optional bool sync_nccl_allreduce = 27 [ default = true ]; + optional bool fuse_broadcast_ops = 28 [ default = true ]; + optional int32 num_threads = 29 [ default = 1 ]; + optional int32 num_iteration_per_run = 30 [ default = 1 ]; // pipeline training optional bool pipeline = 101 [ default = false ]; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 74e344cfebe36..f2421248e33f2 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -30,7 +30,10 @@ static ::DLDataType GetDLDataTypeCode() { } else if (std::is_integral::value) { dtype.code = kDLInt; } else { - PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported data type (%s), only supports float16, float, unsigned " + "int and int.", + platform::demangle(typeid(T).name()))); } dtype.bits = 8 * sizeof(T); dtype.lanes = 1; @@ -52,8 +55,9 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { static auto type_to_dtype_map = CreateDLDataTypeMap(); static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); auto it = type_to_dtype_map.find(static_cast(type)); - PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d", - type); + PADDLE_ENFORCE_NE(it, type_to_dtype_map_end_it, + platform::errors::InvalidArgument( + "Unsupported data type 
(%s).", DataTypeToString(type))); return it->second; #undef REG_DL_DATA_TYPE } @@ -73,7 +77,8 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ctx.device_id = place.device; return ctx; #else - PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); + PADDLE_THROW(platform::errors::Unavailable( + "platform::CUDAPlace is not supported in CPU only version.")); #endif } @@ -84,8 +89,8 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { ctx.device_id = 0; return ctx; #else - PADDLE_THROW( - "platform::CUDAPinnedPlace is not supported in CPU only version"); + PADDLE_THROW(platform::errors::Unavailable( + "platform::CUDAPinnedPlace is not supported in CPU only version.")); #endif } }; @@ -136,7 +141,10 @@ ::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { // refer to cupy and cudf, the compact tensor first dim's strides need to be 1 // and second dim's strides need to be length of rows of cudf // cudf now only support dim=2 - PADDLE_ENFORCE_LE(t_.ndim, 2, "cudf now only support dim=2."); + PADDLE_ENFORCE_LE(t_.ndim, 2, platform::errors::InvalidArgument( + "cudf now only supports dimension is 2, " + "but received dimension is %d.", + t_.ndim)); if (t_.ndim > 1) t_.strides = new int64_t[2]{1, t_.shape[1]}; diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index cbdfa00652abd..3f70835c9d312 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -556,9 +556,11 @@ void DownpourWorker::TrainFilesWithProfiler() { continue; } PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, - "Tensor %s contains Inf", var_name); + platform::errors::InvalidArgument( + "Tensor %s contains Inf.", var_name)); PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, - "Tensor %s contains NAN", var_name); + platform::errors::InvalidArgument( + "Tensor %s contains NAN.", var_name)); } if 
(need_to_push_sparse_) { @@ -829,9 +831,11 @@ void DownpourWorker::TrainFiles() { continue; } PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, - "Tensor %s contains Inf", var_name); + platform::errors::InvalidArgument( + "Tensor %s contains Inf.", var_name)); PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, - "Tensor %s contains NAN", var_name); + platform::errors::InvalidArgument( + "Tensor %s contains NAN.", var_name)); } if (need_to_push_sparse_) { diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 21adcb9948b20..0e3edfb95cb9b 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -26,7 +26,11 @@ struct EigenDim { using Type = Eigen::DSizes; static Type From(const DDim& dims) { - PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); + PADDLE_ENFORCE_EQ(arity(dims), D, + platform::errors::InvalidArgument( + "Input dimension size should be equal to %d, but " + "received dimension size is %d.", + arity(dims), D)); Type ret; for (int64_t d = 0; d < arity(dims); d++) { ret[d] = dims[d]; @@ -69,8 +73,11 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT int num_col_dims) { int rank = tensor.dims_.size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, + platform::errors::InvalidArgument( + "Input dimension number(num_col_dims) must be " + "between 0 and %d, but received number is %d.", + rank, num_col_dims)); return EigenMatrix::From(tensor, flatten_to_2d(tensor.dims(), num_col_dims)); } @@ -78,8 +85,11 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, int num_col_dims) { int rank = tensor.dims_.size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, 
rank_of_tensor)."); + PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, + platform::errors::InvalidArgument( + "Input dimension number(num_col_dims) must be " + "between 0 and %d, but received number is %d.", + rank, num_col_dims)); return EigenMatrix::From(tensor, flatten_to_2d(tensor.dims(), num_col_dims)); } diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 1712d66cf4c99..706248229bc27 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -175,8 +175,9 @@ void DeleteUnusedTensors( garbages.emplace_back(t.MoveMemoryHolder()); } } else { - PADDLE_THROW("Type %s of %s is not supported eager deletion", - framework::ToTypeName(var->Type()), var_name); + PADDLE_THROW(platform::errors::Unimplemented( + "Type %s of variable %s is not supported eager deletion.", + framework::ToTypeName(var->Type()), var_name)); } } diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 08c3e6d7f592d..ac892443de36c 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -79,15 +79,15 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaStreamDestroy(stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); } 
cudaStream_t StreamGarbageCollector::stream() const { return stream_; } diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h index 5b3e9a4df1d11..dc486275d6f58 100644 --- a/paddle/fluid/framework/io/shell.h +++ b/paddle/fluid/framework/io/shell.h @@ -17,6 +17,9 @@ #include #include #ifdef _WIN32 +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif #include #else #include diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index a56fcd1a52339..a4b43086785b3 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -135,7 +135,9 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) { // Check parameters - PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + PADDLE_ENFORCE_EQ(graph->Has(kParamScopeAttr), true, + platform::errors::InvalidArgument( + "Graph have no attribute: kParamScopeAttr.")); auto& scope = graph->Get(kParamScopeAttr); // Create new parameters. 
@@ -193,7 +195,10 @@ void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) { // reshape attention_bias auto* attention_bias_t = scope.FindVar(param.AttentionBias)->GetMutable(); - PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1); + PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1, + platform::errors::InvalidArgument( + "Tensor attention bias dimension size(%d) must be 1.", + attention_bias_t->dims().size())); attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]})); auto* attention_scalar_bias_t = @@ -252,7 +257,10 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, B_forget.data(), B_input.data(), B_output.data(), B_cell.data()}; - PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); + PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1, + platform::errors::InvalidArgument( + "Tensor B forget dimension size(%d) must be 1.", + B_forget.dims().size())); int D = B_forget.dims()[0]; out->Resize(make_ddim({1, 4 * D})); auto* out_data = out->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index d7faf2ee64833..f3634f90e6c69 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -119,9 +119,11 @@ class CoalesceGradTensorPass : public ir::Pass { p_g_dense_grad.insert(p_g_dense_grad.end(), group_p_g.begin(), group_p_g.end()); } - PADDLE_ENFORCE_EQ( - p_g_dense_grad.size(), num_of_p_g_dense_grad, - "The number of p_g_dense_grad is not consistent with before."); + PADDLE_ENFORCE_EQ(p_g_dense_grad.size(), num_of_p_g_dense_grad, + platform::errors::InvalidArgument( + "The number of dense grads is not consistent with " + "previous. 
Previous(%d), now(%d).", + p_g_dense_grad.size(), num_of_p_g_dense_grad)); auto &pinned_var_set = graph->GetOrInit(details::kPinnedVars); @@ -131,8 +133,11 @@ class CoalesceGradTensorPass : public ir::Pass { } else { for (auto &sub_param_grad : group_params_grads) { RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set); - PADDLE_ENFORCE_EQ(IsUnifiedDtype(sub_param_grad, vars_info), true, - "The data type of the same group is not consistent."); + PADDLE_ENFORCE_EQ( + IsUnifiedDtype(sub_param_grad, vars_info), true, + platform::errors::InvalidArgument("All gradient variable in " + "kGroupParamsAndDenseGrads, must " + "have same type.")); CoalesceTensors(vars_info, sub_param_grad, &result); } } @@ -145,15 +150,25 @@ class CoalesceGradTensorPass : public ir::Pass { // The Gradients should not be reused during memory optimization. for (auto &p_g : sub_param_grad) { auto iter = vars_info.find(p_g.second); - PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.", - p_g.second); - PADDLE_ENFORCE_EQ(!iter->second.empty(), true); + PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, + platform::errors::NotFound( + "Parameter@Grad %s is not found.", p_g.second)); + PADDLE_ENFORCE_EQ( + !iter->second.empty(), true, + platform::errors::InvalidArgument( + "Parameter@Grad %s's var node is empty.", p_g.second)); for (auto it : iter->second) { - PADDLE_ENFORCE_NOT_NULL(it->Var()); + PADDLE_ENFORCE_NOT_NULL( + it->Var(), + platform::errors::InvalidArgument( + "A node of Parameter@Grad %s does not hold variable.", + p_g.second)); pinned_var_set->insert(it->Var()->Name()); } PADDLE_ENFORCE_EQ(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)), - true); + true, + platform::errors::InvalidArgument( + "Parameter@Grad %s is not LoDTensor.", p_g.second)); } } @@ -192,8 +207,10 @@ class CoalesceGradTensorPass : public ir::Pass { auto fused_grad_var_name = std::string(details::kFusedVarNamePrefix) + "@GRAD@" + params_grads.begin()->second; auto &fused_var_set = 
result->Get(details::kFusedVars); - PADDLE_ENFORCE_EQ(fused_var_set.count(fused_grad_var_name), 0, - "%s is duplicate in FusedVars.", fused_grad_var_name); + PADDLE_ENFORCE_EQ( + fused_var_set.count(fused_grad_var_name), 0, + platform::errors::AlreadyExists("Var(%s) is duplicate in FusedVars.", + fused_grad_var_name)); fused_var_set.insert(fused_grad_var_name); result->Get(details::kFusedGrads) .emplace_back(fused_grad_var_name); @@ -420,11 +437,16 @@ class CoalesceGradTensorPass : public ir::Pass { const std::unordered_map> &vars_info, const std::string &var_name) const { auto grad_iter = vars_info.find(var_name); - PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.", - var_name); - PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.", - var_name); - PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); + PADDLE_ENFORCE_EQ( + grad_iter != vars_info.end(), true, + platform::errors::NotFound("Variable %s is not found.", var_name)); + PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, + platform::errors::InvalidArgument( + "Variable %s's node is empty.", var_name)); + PADDLE_ENFORCE_NOT_NULL( + grad_iter->second.front()->Var(), + platform::errors::InvalidArgument( + "A node of %s does not hold variable.", var_name)); return grad_iter->second.front()->Var(); } @@ -464,7 +486,12 @@ class CoalesceGradTensorPass : public ir::Pass { params_name.emplace_back(p_g.first); grads_name.emplace_back(p_g.second); auto next_dtype = GetDtypeOfVar(vars_info, p_g.second); - PADDLE_ENFORCE_EQ(next_dtype, dtype); + PADDLE_ENFORCE_EQ( + next_dtype, dtype, + platform::errors::InvalidArgument( + "All Parameter@Grad should have same dtype, but " + "there are two different type: %s, %s.", + DataTypeToString(next_dtype), DataTypeToString(dtype))); } result->Get(details::kProgramDescs).emplace_back(); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index 
fecc159adef19..079fb1479861c 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -50,7 +50,12 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, Eigen::Array>; // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + PADDLE_ENFORCE_EQ( + eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), + platform::errors::InvalidArgument( + "Tensor elementwise y(%d) and activation bias(%d) must have same " + "dimension.", + eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); @@ -78,11 +83,13 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, } void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); GraphPatternDetector gpd; auto* conv_input = @@ -152,11 +159,13 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { } void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); GraphPatternDetector gpd; auto* conv_input = diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 7313ef2cc35dd..60e4ac8cbcfd8 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc 
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -61,7 +61,12 @@ void recompute_bias_and_weights(const Scope* scope, Eigen::Array>; // Re-compute bias of conv2d from BN - PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); + PADDLE_ENFORCE_EQ( + eltwise_y_in_tensor->dims(), bn_bias_tensor.dims(), + platform::errors::InvalidArgument("Tensor elementwise y(%d) and batch " + "norm bias(%d) must have same dims.", + eltwise_y_in_tensor->dims().size(), + bn_bias_tensor.dims().size())); auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable(); auto* variance_tensor = @@ -116,11 +121,13 @@ void recompute_bias_and_weights(const Scope* scope, } void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); GraphPatternDetector gpd; auto* conv_input = @@ -186,11 +193,18 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { if (has_bias && conv->Op()->Input("Bias").size() > 0) { // reuse existing conv bias node auto conv_bias_names = conv->Op()->Input("Bias"); - PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL); + PADDLE_ENFORCE_EQ( + conv_bias_names.size(), 1UL, + platform::errors::InvalidArgument("Find input var Bais error.")); auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); auto* conv_bias_tensor = conv_bias_var->GetMutable(); - PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), - eltwise_y_in_tensor->dims()); + PADDLE_ENFORCE_EQ( + conv_bias_tensor->dims(), eltwise_y_in_tensor->dims(), + platform::errors::InvalidArgument( + "Tensor convolution bias(%d) and elementwise y(%d) " + "must have same dims.", + conv_bias_tensor->dims().size(), + eltwise_y_in_tensor->dims().size())); auto eigen_conv_bias = 
EigenVector::From(*conv_bias_tensor); eigen_conv_bias += EigenVector::From(*eltwise_y_in_tensor); @@ -236,11 +250,13 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { } void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); GraphPatternDetector gpd; auto* conv_input = diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc index 168d0afb26d98..74dd6a7cdc5a6 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc @@ -71,8 +71,16 @@ void TestMain(const std::string& conv_type) { int num_bn_nodes_after = GetNumOpNodes(graph, "batch_norm"); VLOG(3) << DebugString(graph); - PADDLE_ENFORCE_EQ(num_bn_nodes_before, 1); - PADDLE_ENFORCE_EQ(num_bn_nodes_after, 0); + PADDLE_ENFORCE_EQ( + num_bn_nodes_before, 1, + platform::errors::InvalidArgument( + "Before conv_bn_fuse_pass, number of batch norm op(%d) must be 1.", + num_bn_nodes_before)); + PADDLE_ENFORCE_EQ( + num_bn_nodes_after, 0, + platform::errors::InvalidArgument( + "After conv_bn_fuse_pass, number of batch norm op(%d) must be 0.", + num_bn_nodes_after)); } TEST(ConvBNFusePass, conv2d) { TestMain("conv"); } diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index b00be79a2a7da..2627da7dc40f1 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -91,7 +91,9 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { auto* new_conv_op = 
graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. - PADDLE_ENFORCE(subgraph.count(x)); + PADDLE_ENFORCE_NE( + subgraph.count(x), 0, + platform::errors::NotFound("Detector did not find input x of conv2d.")); auto* conv_in_node = subgraph.at(x); IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index b15871ef03fbb..0b454a0407e48 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -78,7 +78,9 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. - PADDLE_ENFORCE(subgraph.count(x)); + PADDLE_ENFORCE_NE( + subgraph.count(x), 0, + platform::errors::NotFound("Detector did not find input x of conv2d.")); auto* conv_in_node = subgraph.at(x); IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 8c491d4f58b4d..007770cf57d27 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -66,7 +66,9 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. 
- PADDLE_ENFORCE(subgraph.count(x)); + PADDLE_ENFORCE_NE( + subgraph.count(x), 0, + platform::errors::NotFound("Detector did not find input x of conv2d.")); auto* conv_in_node = subgraph.at(x); IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index 85e2f2bad323f..c50b7476c6a96 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -64,17 +64,23 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, #undef SET_IN // Multiply embeddings with Weights - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); const std::string& embeddings = patterns::UniqueKey("Embeddings"); auto* embeddings_var = scope->Var(embeddings); - PADDLE_ENFORCE(embeddings_var); + PADDLE_ENFORCE_NOT_NULL( + embeddings_var, + platform::errors::InvalidArgument( + "Embeddings variable's pointer cannot be nullptr.")); auto* embeddings_tensor = embeddings_var->GetMutable(); // Get WeightX size: [single_embedding, fc_size] // and embedding size: [dict_size, single_embedding] // and create new size of embeddings eg. 
[dict_size , hidden_size] auto* embedding_var = scope->FindVar(W->Name()); - PADDLE_ENFORCE(embedding_var); + PADDLE_ENFORCE_NOT_NULL( + embedding_var, platform::errors::InvalidArgument( + "Embedding variable's pointer cannot be nullptr.")); const auto& embedding_tensor = embedding_var->Get(); const auto& weightx_tensor = @@ -90,7 +96,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, // Adding biases to GEMM result to be auto* lstm_bias_var = scope->FindVar(bias->Name()); - PADDLE_ENFORCE(lstm_bias_var); + PADDLE_ENFORCE_NOT_NULL(lstm_bias_var, + platform::errors::InvalidArgument( + "Lstm bias var ptr cannot be nullptr.")); const auto& lstm_bias_tensor = lstm_bias_var->Get(); auto alpha = 1.0f; diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc index c1f822d7ca5cd..51e9545bf92e8 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc @@ -56,8 +56,17 @@ TEST(FCElementwiseLayerNormFusePass, basic) { GetNumOpNodes(graph, "fused_fc_elementwise_layernorm"); VLOG(3) << DebugString(graph); - PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6); - PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1); + PADDLE_ENFORCE_EQ( + num_nodes_before, num_nodes_after + 6, + platform::errors::InvalidArgument( + "After pass, the number of nodes should be reduced by 6, but the " + "number before pass is %d, after pass is %d.", + num_nodes_before, num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, + platform::errors::InvalidArgument( + "After pass, the number of nodes of type " + "'fused_fc_elementwise_layernorm' should be 1, not %d.", + num_fused_nodes_after)); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 6a9c64e3a7f24..066a8fb975740 100644 --- 
a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -25,7 +25,8 @@ namespace framework { namespace ir { void FCFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("fc_fuse", graph); int found_fc_count = 0; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index dfae572d4634e..cf35c1ac772da 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -79,9 +79,17 @@ TEST(FCFusePass, basic) { int num_fc_nodes_after = GetNumOpNodes(graph, "fc"); VLOG(3) << DebugString(graph); - PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6); - PADDLE_ENFORCE_EQ(num_fc_nodes_after, 2); - PADDLE_ENFORCE_EQ(num_mul_nodes_before, num_fc_nodes_after); + PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6, + platform::errors::InvalidArgument( + "num_nodes_before=%d, num_nodes_after=%d.", + num_nodes_before, num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fc_nodes_after, 2, + platform::errors::InvalidArgument("num_fc_nodes_after=%d.", + num_fc_nodes_after)); + PADDLE_ENFORCE_EQ(num_mul_nodes_before, num_fc_nodes_after, + platform::errors::InvalidArgument( + "num_mul_nodes_before=%d, num_fc_nodes_after=%d.", + num_mul_nodes_before, num_fc_nodes_after)); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index d26998e6fc99d..a2185cdc5593c 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -26,15 +26,15 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); - // Create pattern. 
- patterns::FC fc_pattern(pattern, name_scope); - patterns::GRU gru_pattern(pattern, name_scope); - PDNode* x = pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable(); + // Create pattern. + patterns::FC fc_pattern(pattern, name_scope); auto* fc_out = fc_pattern(x, with_fc_bias, /* with_relu */ false); fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. + + patterns::GRU gru_pattern(pattern, name_scope); gru_pattern(fc_out); // Create New OpDesc @@ -48,17 +48,18 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, SET_IN(X, x); SET_IN(WeightX, weight_x); SET_IN(WeightH, weight_h); - if (with_fc_bias) { - op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()}); - } else { - SET_IN(Bias, bias); - } + SET_IN(Bias, bias); #undef SET_IN + // TODO(grygielski): Add H0 to the pass op_desc.SetInput("H0", {}); op_desc.SetOutput("Hidden", {hidden->Name()}); op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("origin_mode", + gru->Op()->GetAttrIfExists("origin_mode")); // TODO(TJ): This should be a option for infer op_desc.SetAttr("use_seq", true); + op_desc.SetAttr("activation", gru->Op()->GetAttr("activation")); + op_desc.SetAttr("gate_activation", gru->Op()->GetAttr("gate_activation")); #define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)}) SET_IMTERMEDIATE_OUT(ReorderedH0); @@ -68,26 +69,30 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, #undef SET_IMTERMEDIATE_OUT auto* op = graph->CreateOpNode(&op_desc); - PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); - auto& scope = graph->Get(kParamScopeAttr); if (with_fc_bias) { - // Fusion GRU bias = fcbias + grubias - auto* fusion_bias_var = scope.Var(NEW_NAME(bias) + bias->Name()); - auto* out_bias_tensor = - fusion_bias_var->GetMutable(); - PADDLE_ENFORCE(fusion_bias_var); - auto* gru_bias_var = scope.FindVar(bias->Name()); - auto* fc_bias_var = scope.FindVar(fc_bias->Name()); - 
PADDLE_ENFORCE(gru_bias_var); - PADDLE_ENFORCE(fc_bias_var); - const auto& gru_bias_tenosr = gru_bias_var->Get(); - const auto& fc_bias_tensor = fc_bias_var->Get(); - // new bias = fc bias + gru bias - out_bias_tensor->Resize(gru_bias_tenosr.dims()); - auto* data = out_bias_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < out_bias_tensor->numel(); i++) { - data[i] = - fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; + auto* gru_bias_var = scope->FindVar(bias->Name()); + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + PADDLE_ENFORCE_NE( + gru_bias_var, nullptr, + platform::errors::NotFound("GRU bias var has not been found.")); + PADDLE_ENFORCE_NE( + fc_bias_var, nullptr, + platform::errors::NotFound("FC bias var has not been found.")); + + auto* gru_bias_tensor = gru_bias_var->GetMutable(); + auto* fc_bias_tensor = fc_bias_var->GetMutable(); + PADDLE_ENFORCE_EQ( + gru_bias_tensor->numel(), fc_bias_tensor->numel(), + platform::errors::PreconditionNotMet( + "GRU and FC biases have to have equal number of elements.")); + + auto gru_bias_data = + gru_bias_tensor->mutable_data(platform::CPUPlace()); + auto* fc_bias_data = fc_bias_tensor->data(); + + // Recompute GRU bias + for (int i = 0; i < gru_bias_tensor->numel(); ++i) { + gru_bias_data[i] += fc_bias_data[i]; } } #undef GET_NODE @@ -108,7 +113,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, IR_NODE_LINK_TO(x, op); IR_NODE_LINK_TO(weight_x, op); IR_NODE_LINK_TO(weight_h, op); - IR_NODE_LINK_TO(bias, op); // actually should link to new bias if have + IR_NODE_LINK_TO(bias, op); IR_NODE_LINK_TO(op, hidden); // h0? 
return op; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 44306a729544d..12c7fc051e23a 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -52,13 +52,17 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, #undef SET_IN if (with_fc_bias) { // Add FC-bias with LSTM-bias and create a new weight - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); const std::string& new_bias_var = patterns::UniqueKey("NewBias"); auto* bias_var = scope->Var(new_bias_var); - PADDLE_ENFORCE(bias_var); + PADDLE_ENFORCE_NOT_NULL(bias_var, platform::errors::InvalidArgument( + "Bias var ptr cannot be nullptr.")); auto* bias_tensor = bias_var->GetMutable(); auto* lstm_bias_var = scope->FindVar(bias->Name()); - PADDLE_ENFORCE(lstm_bias_var); + PADDLE_ENFORCE_NOT_NULL(lstm_bias_var, + platform::errors::InvalidArgument( + "Lstm bias var ptr cannot be nullptr.")); const auto& lstm_bias_tensor = lstm_bias_var->Get(); bias_tensor->Resize(lstm_bias_tensor.dims()); diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index 7d6ef5b9023b0..54c05046a2c2f 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -320,7 +320,7 @@ std::vector FuseBatchNormActPass::ReplaceNode( return node; }); PADDLE_ENFORCE_EQ(has_replaced, true, - platform::errors::NotFound("Not find %s in the node list.", + platform::errors::NotFound("Not found %s in the node list.", cur_node->Name())); return new_list; } diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 5c2c574fd681a..b559d66fe7456 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ 
b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -42,7 +42,8 @@ void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const { // ele_add(x, act(y)) ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct( ir::Graph *graph, const std::unordered_set &act_types) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("elewise_add_act", graph); GraphPatternDetector gpd; @@ -93,7 +94,8 @@ ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct( // act(ele_add(x,y)) ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd( ir::Graph *graph, const std::unordered_set &act_types) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("act_elewise_add", graph); GraphPatternDetector gpd; @@ -145,7 +147,8 @@ ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd( // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( ir::Graph *graph, const std::unordered_set &act_types) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("elewise_add_act_grad", graph); GraphPatternDetector gpd; @@ -252,10 +255,11 @@ void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const { bool save_intermediate_out = BOOST_GET_CONST( bool, cur_node->Op()->GetAttr("save_intermediate_out")); auto intermediate_out_args = cur_node->Op()->Output("IntermediateOut"); - PADDLE_ENFORCE( - save_intermediate_out && !intermediate_out_args.empty(), - "The %s should save the intermediate_out in the fusing stage.", - cur_node->Name()); + PADDLE_ENFORCE_EQ( + (save_intermediate_out && !intermediate_out_args.empty()), true, + platform::errors::InvalidArgument( + "The %s should save the intermediate out in the fusing stage.", + cur_node->Name())); // If the 
intermediate_out's output is empty, it should be removed. auto cur_node_outputs = cur_node->outputs; @@ -271,10 +275,11 @@ void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const { } else if (cur_node->Name() == "fused_elemwise_activation_grad") { auto intermediate_out_grad_args = cur_node->Op()->Output(GradVarName("IntermediateOut")); - PADDLE_ENFORCE( - !intermediate_out_grad_args.empty(), - "The %s should save the intermediate_out in the fusing stage.", - cur_node->Name()); + PADDLE_ENFORCE_EQ( + intermediate_out_grad_args.empty(), false, + platform::errors::InvalidArgument( + "The %s should save the intermediate out in the fusing stage.", + cur_node->Name())); auto cur_node_outputs = cur_node->outputs; // If the intermediate_out_g's output is empty, it should be removed. for (auto &out : cur_node_outputs) { @@ -312,7 +317,11 @@ void FuseElewiseAddActPass::ReLinkNodes(Graph *graph, nodes2delete.emplace(out); } } else { - PADDLE_ENFORCE(out == intermediate_out); + PADDLE_ENFORCE_EQ( + out, intermediate_out, + platform::errors::InvalidArgument( + "Output of op(%s) must be %s, but not %s.", op_1->Name(), + intermediate_out->Name(), out->Name())); IR_OP_VAR_LINK(fused_op, out); } } @@ -347,8 +356,9 @@ std::vector FuseElewiseAddActPass::ReplaceNode( } return node; }); - PADDLE_ENFORCE(has_replaced, "Not find %s in the node list.", - cur_node->Name()); + PADDLE_ENFORCE_EQ(has_replaced, true, + platform::errors::NotFound("Not found %s in the node list.", + cur_node->Name())); return new_list; } diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc index 482d8cf3d2f19..c284c1f4587cd 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc @@ -50,18 +50,25 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { fused_scale2->inputs.end()); for (auto 
&out_node : fused_scale1->outputs) { if (fused_scale2_in_nodes.count(out_node)) { - PADDLE_ENFORCE(out_node->IsCtrlVar(), - "The dependency var only should be ctrl var."); + PADDLE_ENFORCE_EQ(out_node->IsCtrlVar(), true, + platform::errors::PreconditionNotMet( + "In adam op pass, the dependency var(%s) only " + "should be ctrl var.", + out_node->Name())); not_need_ctrl_var_nodes.insert(out_node); } } for (auto &node : not_need_ctrl_var_nodes) { // remove this node from the input op node. - PADDLE_ENFORCE(!node->inputs.empty(), - "The input should not be empty here."); + PADDLE_ENFORCE_EQ( + node->inputs.empty(), false, + platform::errors::PreconditionNotMet( + "Node(%s)'s input should not be empty here.", node->Name())); auto op_node = node->inputs.front(); - PADDLE_ENFORCE(op_node->IsOp()); + PADDLE_ENFORCE_EQ(op_node->IsOp(), true, + platform::errors::PreconditionNotMet( + "Node(%s) should be an OP node.", op_node->Name())); op_node->outputs.erase( remove_if( op_node->outputs.begin(), op_node->outputs.end(), @@ -85,7 +92,9 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &adam_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); + PADDLE_ENFORCE_GT( + adam_ops.size(), static_cast(0), + platform::errors::InvalidArgument("No adam op in the graph.")); // Check attributions // NOTE: If new attribution is added, the following code maybe need change. 
@@ -102,22 +111,58 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { int64_t, adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); for (auto &adam_op : adam_ops) { PADDLE_ENFORCE_EQ( - beta1, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta1"))); + beta1, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta1")), + platform::errors::PreconditionNotMet( + "All adam Op's attr(beta1) must be same, but there are two " + "different " + "value: %f, %f.", + beta1, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta1")))); PADDLE_ENFORCE_EQ( - beta2, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta2"))); + beta2, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta2")), + platform::errors::PreconditionNotMet( + "All adam Op's attr(beta2) must be same, but there are two " + "different " + "value: %f, %f.", + beta2, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta2")))); PADDLE_ENFORCE_EQ( - epsilon, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("epsilon"))); + epsilon, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("epsilon")), + platform::errors::PreconditionNotMet( + "All adam Op's attr(epsilon) must be same, but there are two " + "different " + "value: %f, %f.", + epsilon, + BOOST_GET_CONST(float, adam_op->Op()->GetAttr("epsilon")))); PADDLE_ENFORCE_EQ( - lazy_mode, - BOOST_GET_CONST(bool, adam_op->Op()->GetAttr("lazy_mode"))); + lazy_mode, BOOST_GET_CONST(bool, adam_op->Op()->GetAttr("lazy_mode")), + platform::errors::PreconditionNotMet( + "All adam Op's attr(lazy_mode) must be same, but there are two " + "different " + "value: %d, %d.", + lazy_mode, + BOOST_GET_CONST(bool, adam_op->Op()->GetAttr("lazy_mode")))); PADDLE_ENFORCE_EQ( min_row_size_to_use_multithread, BOOST_GET_CONST(int64_t, adam_op->Op()->GetAttr( - "min_row_size_to_use_multithread"))); + "min_row_size_to_use_multithread")), + platform::errors::PreconditionNotMet( + "All adam Op's attr(min_row_size_to_use_multithread) must be " + "same, but there are two different value: %I64, %I64.", 
+ min_row_size_to_use_multithread, + BOOST_GET_CONST( + int64_t, + adam_op->Op()->GetAttr("min_row_size_to_use_multithread")))); PADDLE_ENFORCE_EQ( op_role, BOOST_GET_CONST(int, adam_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); + OpProtoAndCheckerMaker::OpRoleAttrName())), + platform::errors::PreconditionNotMet( + "All adam Op's attr(op_role) must be same, but there are two " + "different " + "value: %d, %d.", + op_role, + BOOST_GET_CONST(int, + adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())))); } // NOTE: fused_var is only exist in scope, so the graph doesn't have @@ -154,7 +199,10 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { const std::string &fused_var_name, const std::vector &adam_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size(), + platform::errors::InvalidArgument( + "Beta name size(%d) must equal to adam op size(%d).", + beta_name.size(), adam_ops.size())); const std::string scale_op_name = "scale"; // Get the scale_ops of dealing the adam's beta var. 
@@ -168,7 +216,9 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name; }); - PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + PADDLE_ENFORCE_NE(beta_pow_iter, adam_ops[i]->inputs.end(), + platform::errors::NotFound( + "Can not find %s in adam ops.", beta_1_pow_name)); auto beta_pow_node = *beta_pow_iter; auto scale_op_iter = std::find_if( @@ -176,11 +226,18 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { [&scale_op_name](ir::Node *op_node) -> bool { return op_node->Op() && op_node->Op()->Type() == scale_op_name; }); - PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + PADDLE_ENFORCE_NE( + scale_op_iter, beta_pow_node->outputs.end(), + platform::errors::NotFound("Can not find %s in beta pow node.", + scale_op_name)); scale_ops.emplace_back(*scale_op_iter); } - PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + PADDLE_ENFORCE_EQ( + scale_ops.size(), beta_name.size(), + platform::errors::PreconditionNotMet( + "Beta name size(%d) must equal to scale ops size(%d).", + beta_name.size(), scale_ops.size())); VLOG(6) << "The number of scale op is " << scale_ops.size() << "."; // Check attributions // NOTE: If new attribution is added, the following code maybe need change. 
@@ -193,16 +250,40 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { BOOST_GET_CONST(bool, scale_ops[0]->Op()->GetAttr("bias_after_scale")); for (auto &scale_op : scale_ops) { PADDLE_ENFORCE_EQ( - scale, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale"))); + scale, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale")), + platform::errors::PreconditionNotMet( + "All scale Op's attr(scale) must be same, but there are two " + "different " + "value: %f, %f.", + scale, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale")))); PADDLE_ENFORCE_EQ( - bias, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias"))); + bias, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias")), + platform::errors::PreconditionNotMet( + "All scale Op's attr(bias) must be same, but there are two " + "different " + "value: %f, %f.", + bias, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias")))); PADDLE_ENFORCE_EQ( bias_after_scale, - BOOST_GET_CONST(bool, scale_op->Op()->GetAttr("bias_after_scale"))); + BOOST_GET_CONST(bool, scale_op->Op()->GetAttr("bias_after_scale")), + platform::errors::PreconditionNotMet( + "All scale Op's attr(bias_after_scale) must be same, but there " + "are two different value: %d, %d.", + bias_after_scale, + BOOST_GET_CONST(bool, + scale_op->Op()->GetAttr("bias_after_scale")))); PADDLE_ENFORCE_EQ( op_role, BOOST_GET_CONST(int, scale_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); + OpProtoAndCheckerMaker::OpRoleAttrName())), + platform::errors::PreconditionNotMet( + "All scale Op's attr(op_role) must be same, but there are two " + "different " + "value: %d, %d.", + op_role, + BOOST_GET_CONST(int, + scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())))); } // NOTE: fused_var is only exist in scope, so the graph doesn't have diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc index 
f70745be1bd60..43ec8bff5edc1 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc @@ -37,7 +37,9 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &momentum_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(momentum_ops.size(), static_cast(0)); + PADDLE_ENFORCE_GT( + momentum_ops.size(), static_cast(0), + platform::errors::InvalidArgument("Momentum ops must not be empty.")); // Check attributions // NOTE: If new attribution is added, the following code maybe need change. @@ -50,14 +52,32 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { for (auto &momentum_op : momentum_ops) { PADDLE_ENFORCE_EQ( - mu, BOOST_GET_CONST(float, momentum_op->Op()->GetAttr("mu"))); + mu, BOOST_GET_CONST(float, momentum_op->Op()->GetAttr("mu")), + platform::errors::InvalidArgument( + "All momentum Op's attr(mu) must be same, but there are two " + "different " + "value: %f, %f.", + mu, BOOST_GET_CONST(float, momentum_op->Op()->GetAttr("mu")))); PADDLE_ENFORCE_EQ( use_nesterov, - BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr("use_nesterov"))); + BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr("use_nesterov")), + platform::errors::InvalidArgument( + "All momentum Op's attr(use_nesterov) must be same, but there " + "are two different value: %d, %d.", + use_nesterov, BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr( + "use_nesterov")))); PADDLE_ENFORCE_EQ( op_role, BOOST_GET_CONST(int, momentum_op->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName()))); + OpProtoAndCheckerMaker::OpRoleAttrName())), + platform::errors::InvalidArgument( + "All momentum Op's attr(op_role) must be same, but there are two " + "different " + "value: %d, %d.", + op_role, + BOOST_GET_CONST(int, + momentum_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())))); } //
NOTE: fused_var is only exist in scope, so the graph doesn't have diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 35bdfde96bc3c..fa86db891f881 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -41,10 +41,12 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { for (auto &node : topo_nodes) { if (node->Op()->Type() == fuse_op_type) { auto grad_name = node->Op()->Input(kGrad); - PADDLE_ENFORCE_EQ(grad_name.size(), static_cast(1), - "The %s operator has multiple gradient input. Expected " - "it to only have one gradient input.", - fuse_op_type); + PADDLE_ENFORCE_EQ( + grad_name.size(), static_cast(1), + platform::errors::InvalidArgument( + "The %s operator has multiple gradient input. Expected " + "it to only have one gradient input.", + fuse_op_type)); if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) { opt_nodes.emplace_back(node); } @@ -96,7 +98,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { VLOG(6) << var_name << ": " << fused_var_name; PADDLE_ENFORCE_EQ( fused_var_set.count(fused_var_name), 0, - platform::errors::AlreadyExists("The fused variable already exists.")); + platform::errors::AlreadyExists( + "The fused variable(%s) already exists.", fused_var_name)); fused_var_set.insert(fused_var_name); fused_vars_name.emplace(var_name, fused_var_name); } @@ -110,7 +113,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { result.Get(details::kParamsAndDenseGrads); PADDLE_ENFORCE_LE( params_and_dense_grads.size(), aux_var_map.at(kGrad).size(), - "The number of dense gradients should be little than optimizer ops."); + platform::errors::InvalidArgument( + "The number of dense gradients(%d) should be " + "little than optimizer ops(%d).", + params_and_dense_grads.size(), 
aux_var_map.at(kGrad).size())); std::unordered_set opt_grad_set(aux_var_map.at(kGrad).size()); for (auto &p_g : params_and_dense_grads) { @@ -130,13 +136,14 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { // some gradient's name maybe changed. if (new_grad_idx.size() == 0) { if (!result.Has(details::kFusedGrads)) { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "The coalesce_grad_tensor_pass should " - "be called before this pass."); + "be called before this pass.")); } auto &fused_grad = result.Get(details::kFusedGrads); PADDLE_ENFORCE_NE(fused_grad.size(), 0, - "The fused gradient should not be empty."); + platform::errors::NotFound( + "The fused gradient should not be empty.")); if (fused_grad.size() > 1) { // Note(chenweihang): Because the dtype of those gradients is not // unified,so the number of fused gradients is more than one, @@ -146,8 +153,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { auto &fused_vars = result.Get(details::kFusedVars); auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front()); - PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true, - "Not found the fused gradient variable."); + PADDLE_ENFORCE_EQ( + iter != fused_vars.end(), true, + platform::errors::NotFound("Not found the fused gradient variable.")); fused_vars_name[kGrad] = fused_grad.front(); // Sort the parameters and auxiliary variables according @@ -334,16 +342,24 @@ void FuseOptimizerOpPass::FuseGradientsToContinuousSpace( // The Gradients should not be reused during memory optimization. 
for (auto &grad_var_name : grads) { auto iter = vars_info.find(grad_var_name); - PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, - "The gradient variable %s is not found.", grad_var_name); - PADDLE_ENFORCE_EQ(!iter->second.empty(), true, - "The gradient var node %s is not found.", grad_var_name); - PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var(), - "The gradient var node is null."); + PADDLE_ENFORCE_EQ( + iter != vars_info.end(), true, + platform::errors::NotFound("The gradient variable %s is not found.", + grad_var_name)); + PADDLE_ENFORCE_EQ( + !iter->second.empty(), true, + platform::errors::NotFound("The gradient var node %s is not found.", + grad_var_name)); + PADDLE_ENFORCE_NOT_NULL( + iter->second.front()->Var(), + platform::errors::InvalidArgument("The gradient var(%s) node is null.", + grad_var_name)); PADDLE_ENFORCE_EQ( IsLoDTensorType(iter->second.front()->Var()->GetType()), true, - "Currently the gradient type only should be LoDTensor when " - "fusing optimizer ops."); + platform::errors::InvalidArgument( + "Currently the gradient(%s) type only should be LoDTensor when " + "fusing optimizer ops.", + grad_var_name)); for (auto var : iter->second) { pinned_var_set.insert(var->Var()->Name()); } @@ -382,11 +398,14 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo( const std::string &var_name) const { auto grad_iter = vars_info.find(var_name); PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, - "The gradient variable %s is not found.", var_name); + platform::errors::NotFound( + "The gradient variable %s is not found.", var_name)); PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, - "The gradient var node %s is not found.", var_name); + platform::errors::NotFound( + "The gradient var node %s is not found.", var_name)); PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(), - "The gradient var node is null."); + platform::errors::InvalidArgument( + "The gradient var(%s) node is null.", var_name)); return 
grad_iter->second.front()->Var(); } @@ -428,8 +447,9 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( const std::vector> &params_grads, std::unordered_map> *aux_var_map, std::vector *ops) const { - PADDLE_ENFORCE_NE(aux_var_map->count(kGrad), static_cast(0), - "The gradient variable doesn‘t exist."); + PADDLE_ENFORCE_NE( + aux_var_map->count(kGrad), static_cast(0), + platform::errors::NotFound("The gradient variable doesn't exist.")); auto &grad_vec = aux_var_map->at(kGrad); std::vector grad_sort_idx; @@ -437,8 +457,10 @@ for (auto &p_g : params_grads) { auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second); - PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true, - "%s is not found in gradient vector", p_g.second); + PADDLE_ENFORCE_EQ( + iter != grad_vec.end(), true, + platform::errors::NotFound( + "Parameter@Grad(%s) is not found in gradient vector.", p_g.second)); auto idx = std::distance(grad_vec.begin(), iter); grad_sort_idx.emplace_back(idx); } @@ -477,9 +499,10 @@ void FuseOptimizerOpPass::GetFusingVarNamesMap( for (auto &var_n : aux_vars_name) { auto arg_names = node->Op()->Input(var_n); PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1), - "The input variable of optimizer to be fused is " - "invalid. Excepted %s only has one %s input.", - node->Op()->Type(), var_n); + platform::errors::InvalidArgument( + "The input variable of optimizer to be fused is " + "invalid. 
Expected %s only has one %s input.", + node->Op()->Type(), var_n)); (*aux_args_name)[var_n].emplace_back(arg_names[0]); } } @@ -525,10 +548,14 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode( auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars, &fused_opt_node](ir::Node *ctr_var_node) { PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1, - "The control var node has nultiple inputs."); + platform::errors::InvalidArgument( + "The control var(%s) node has multiple inputs.", + ctr_var_node->Name())); if (ctr_var_node->inputs.front() == fused_opt_node) { - PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0, - "The control var node has no output."); + PADDLE_ENFORCE_GT( + ctr_var_node->outputs.size(), 0, + platform::errors::InvalidArgument( + "The control var(%s) node has no output.", ctr_var_node->Name())); auto output_ops = ctr_var_node->outputs; output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(), [&fused_opt_node](const ir::Node *node) { diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc index 1504f00b27cd6..70d4d2b865230 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc @@ -35,7 +35,9 @@ class FuseSgdOpPass : public FuseOptimizerOpPass { const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &sgd_ops, ir::Graph *graph) const { - PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); + PADDLE_ENFORCE_GT( + sgd_ops.size(), static_cast(0), + platform::errors::InvalidArgument("SGD ops must not be empty.")); // NOTE: fused_var is only exist in scope, so the graph doesn't have // fused_var node. 
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc index c7bf53f3d6119..e6fb1302e275f 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.cc +++ b/paddle/fluid/framework/ir/fuse_pass_base.cc @@ -25,14 +25,19 @@ void FusePassBase::Init(const std::string& repr, Graph* graph) const { } Scope* FusePassBase::param_scope() const { - PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); + PADDLE_ENFORCE_EQ(graph_->Has(kParamScopeAttr), true, + platform::errors::InvalidArgument( + "Graph must have kParamScopeAttr attribute.")); auto& scope = graph_->Get(kParamScopeAttr); return &scope; } void FusePassBase::AddStatis(int count_of_fused) const { - PADDLE_ENFORCE(graph_); - PADDLE_ENFORCE(!repr_.empty()); + PADDLE_ENFORCE_NOT_NULL( + graph_, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + PADDLE_ENFORCE_EQ(repr_.empty(), false, + platform::errors::InvalidArgument( + "Fuse pass must be initialized with a name.")); if (!graph_->Has(kFuseStatisAttr)) { graph_->Set(kFuseStatisAttr, new std::unordered_map); } diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc index c4e6b6e6a52ec..56ca98b566070 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -31,7 +31,8 @@ void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const { ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( ir::Graph *graph, bool only_forward) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); if (only_forward) FusePassBase::Init("relu_depthwise_conv_only_forward", graph); else @@ -110,23 +111,45 @@ ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( xg_var = subgraph.at(xg)->Var(); } - PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL); - 
PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name()); + PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL, + platform::errors::InvalidArgument( + "Op(%s)'s input size(%d) must be 1.", + layer_op->Type(), layer_op->Input("Input").size())); + PADDLE_ENFORCE_EQ( + layer_op->Input("Input")[0], y_var->Name(), + platform::errors::InvalidArgument( + "Op(%s)'s input name(%s) must be %s.", layer_op->Type(), + layer_op->Input("Input")[0], y_var->Name())); layer_op->SetInput("Input", {x_var->Name()}); subgraph.at(layer)->inputs.push_back(subgraph.at(x)); subgraph.at(x)->outputs.push_back(subgraph.at(layer)); VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name(); if (!only_forward) { - PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL); - PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name()); + PADDLE_ENFORCE_EQ( + layer_g_op->Input("Input").size(), 1UL, + platform::errors::InvalidArgument( + "Op(%s)'s input size(%d) must be 1.", layer_g_op->Type(), + layer_g_op->Input("Input").size())); + PADDLE_ENFORCE_EQ( + layer_g_op->Input("Input")[0], y_var->Name(), + platform::errors::InvalidArgument( + "Op(%s)'s input name(%s) must be %s.", layer_g_op->Type(), + layer_g_op->Input("Input")[0], y_var->Name())); layer_g_op->SetInput("Input", {x_var->Name()}); subgraph.at(layer_g)->inputs.push_back(subgraph.at(x)); subgraph.at(x)->outputs.push_back(subgraph.at(layer_g)); - PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL); - PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0], - yg_var->Name()); + PADDLE_ENFORCE_EQ( + layer_g_op->Output(GradVarName("Input")).size(), 1UL, + platform::errors::InvalidArgument( + "Op(%s)'s input size(%d) must be 1.", layer_g_op->Type(), + layer_g_op->Output(GradVarName("Input")).size())); + PADDLE_ENFORCE_EQ( + layer_g_op->Output(GradVarName("Input"))[0], yg_var->Name(), + platform::errors::InvalidArgument( + "Op(%s)'s input name(%s) must be %s.", layer_g_op->Type(), + 
layer_g_op->Output(GradVarName("Input"))[0], yg_var->Name())); layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()}); subgraph.at(layer_g)->outputs.push_back(subgraph.at(xg)); subgraph.at(xg)->inputs.push_back(subgraph.at(layer_g)); diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b397216f0b4d1..ff0e0e65a297f 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -136,7 +136,9 @@ bool FindCircleSubGraph(const Graph &graph, std::vector TopologySortOperations(const Graph &graph) { std::map, ir::NodeComp> adj_list = BuildOperationAdjList(graph); - PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); + PADDLE_ENFORCE_EQ(HasCircleInternal(adj_list, nullptr), false, + platform::errors::InvalidArgument( + "Generated graph shouldn't contain cycle.")); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { @@ -161,7 +163,11 @@ BuildOperationAdjList(const Graph &graph) { } for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { - PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + PADDLE_ENFORCE_EQ( + adj_n->NodeType(), ir::Node::Type::kOperation, + platform::errors::InvalidArgument( + "Node(%s)'s type(%d) must be kOperation type.", adj_n->Name(), + static_cast(adj_n->NodeType()))); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); @@ -184,7 +190,11 @@ std::map> BuildOperationOutAdjList( } for (auto &var : n->outputs) { for (auto &adj_n : var->outputs) { - PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + PADDLE_ENFORCE_EQ( + adj_n->NodeType(), ir::Node::Type::kOperation, + platform::errors::InvalidArgument( + "Node(%s)'s type(%d) must be kOperation type.", adj_n->Name(), + static_cast(adj_n->NodeType()))); VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << 
reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); @@ -359,7 +369,10 @@ size_t GraphNum(const Graph &graph) { } std::unique_ptr fout( new std::ofstream(FLAGS_print_sub_graph_dir)); - PADDLE_ENFORCE(fout->good()); + PADDLE_ENFORCE_EQ(fout->good(), true, + platform::errors::Unavailable( + "Can not open file %s for printing the graph.", + FLAGS_print_sub_graph_dir)); *fout << out.str(); } } diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index abcba32a6492b..4b403c46260c6 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -37,12 +37,14 @@ NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other) : stack_(other.stack_), visited_(other.visited_) {} Node &NodesDFSIterator::operator*() { - PADDLE_ENFORCE(!stack_.empty()); + PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange( + "The iterator exceeds range.")); return *stack_.top(); } NodesDFSIterator &NodesDFSIterator::operator++() { - PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range"); + PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange( + "The iterator exceeds range.")); visited_.insert(stack_.top()); auto *cur = stack_.top(); stack_.pop(); @@ -73,11 +75,18 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { } NodesTSIterator::NodesTSIterator(const std::vector &source) { - PADDLE_ENFORCE(!source.empty(), - "Start points of topological sorting should not be empty!"); + PADDLE_ENFORCE_EQ( + source.empty(), false, + platform::errors::InvalidArgument( + "Start points of topological sorting should not be empty!")); // CHECK all the inputs' in-degree is 0 for (auto *node : source) { - PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + PADDLE_ENFORCE_EQ( + CheckNodeIndegreeEquals(*node, 0), true, + platform::errors::InvalidArgument( + "In start points of topological sorting, the indegree of each " + "point should be 0. 
Node(%s)'s indegree is not 0.", + node->Name())); } std::set to_visit{source.begin(), source.end()}; @@ -106,7 +115,11 @@ NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) : sorted_(other.sorted_), cursor_(other.cursor_) {} Node &NodesTSIterator::operator*() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + PADDLE_ENFORCE_LT( + cursor_, sorted_.size(), + platform::errors::OutOfRange( + "The iterator exceeds range. Container size is %d, but index is %d.", + sorted_.size(), cursor_)); return *sorted_[cursor_]; } @@ -128,7 +141,11 @@ bool NodesTSIterator::operator==(const NodesTSIterator &other) { } Node *NodesTSIterator::operator->() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + PADDLE_ENFORCE_LT( + cursor_, sorted_.size(), + platform::errors::OutOfRange( + "The iterator exceeds range. Container size is %d, but index is %d.", + sorted_.size(), cursor_)); return sorted_[cursor_]; } diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h index f6772f9a37567..bb4212bcd33d7 100644 --- a/paddle/fluid/framework/ir/graph_traits.h +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -15,6 +15,8 @@ #pragma once #include +#include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -66,7 +68,7 @@ struct NodesDFSIterator struct NodesTSIterator : public std::iterator { NodesTSIterator() = default; - NodesTSIterator(const std::vector &source); + explicit NodesTSIterator(const std::vector &source); NodesTSIterator(NodesTSIterator &&other) : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { other.cursor_ = 0; @@ -104,7 +106,10 @@ struct GraphTraits { static iterator_range TS(const Graph &g) { auto start_points = ExtractStartPoints(g); - PADDLE_ENFORCE(!start_points.empty()); + PADDLE_ENFORCE_EQ( + start_points.empty(), false, + platform::errors::InvalidArgument( + "Start points of topological sorting should not be empty!")); NodesTSIterator x(start_points); return 
iterator_range(NodesTSIterator(start_points), NodesTSIterator()); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 7f4519ad9919d..64f5376a784c2 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -42,7 +42,10 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { const std::string& graph_viz_path = Get(kGraphvizPath); VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr fout(new std::ofstream(graph_viz_path)); - PADDLE_ENFORCE(fout->good()); + PADDLE_ENFORCE_EQ( + fout->good(), true, + platform::errors::Unavailable( + "Can not open file %s for printing the graph.", graph_viz_path)); std::ostream& sout = *fout; std::unordered_map node2dot; diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index a39901e63bf65..c8dfa02f469a3 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -64,7 +64,11 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) { auto* arguments = parameter.mutable_arguments(); auto it = std::find(arguments->begin(), arguments->end(), scale_in_name); - PADDLE_ENFORCE(it != arguments->end()); + PADDLE_ENFORCE_NE( + it, arguments->end(), + platform::errors::NotFound( + "Can not find input variable(%s) from scale op(%s).", + scale_in_name, pre_op_desc->Type())); *it = scale_out_name; } diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc index a0cb7e93306d2..864a0379988fa 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -33,7 +33,8 @@ const char kSumGradOpName[] = "sum"; const char kOptimizerType[] = "sgd"; void 
LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); // We could collect all weights' name from SGD, where // W1 <- SGD(W0, Grad0) @@ -41,7 +42,10 @@ void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { for (auto* node : graph->Nodes()) { if (IsOpNamed(node, kOptimizerType)) { auto& param_out_vars = node->Op()->Output("ParamOut"); - PADDLE_ENFORCE(param_out_vars.size() == 1u); + PADDLE_ENFORCE_EQ( + param_out_vars.size(), 1u, + platform::errors::InvalidArgument( + "In op(%s), find output(ParamOut) failed.", node->Name())); weight_var_set.insert(param_out_vars[0]); } } @@ -95,12 +99,19 @@ void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Found forward_op " << forward_op->Name(); - PADDLE_ENFORCE(forward_op); + PADDLE_ENFORCE_NOT_NULL( + forward_op, platform::errors::NotFound( + "Can not find forward op for backward op(%s).", + backward_op->Name())); Node* new_optimizer_node = CreateNewSGDNode( graph, forward_op, backward_op, node, opt_node); - PADDLE_ENFORCE(new_optimizer_node); + PADDLE_ENFORCE_NOT_NULL( + new_optimizer_node, + platform::errors::InvalidArgument( + "Create new SGD node failed, backward op is %s.", + backward_op->Name())); } } } @@ -144,11 +155,21 @@ void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { ir::Node* LockFreeOptimizePass::CreateNewSGDNode( ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node, ir::Node* grad_sum_node, ir::Node* optimize_node) const { - PADDLE_ENFORCE(graph); - PADDLE_ENFORCE(forward_node); - PADDLE_ENFORCE(backward_node); - PADDLE_ENFORCE(grad_sum_node); - PADDLE_ENFORCE(optimize_node); + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Input argument graph cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + forward_node, platform::errors::InvalidArgument( + "Input argument forward_node cannot be nullptr.")); + 
PADDLE_ENFORCE_NOT_NULL( + backward_node, platform::errors::InvalidArgument( + "Input argument backward_node cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + grad_sum_node, platform::errors::InvalidArgument( + "Input argument grad_sum_node cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + optimize_node, platform::errors::InvalidArgument( + "Input argument optimize_node cannot be nullptr.")); // find the grad var node between the grad sum node and backward_node std::vector grad_vars = @@ -159,7 +180,8 @@ ir::Node* LockFreeOptimizePass::CreateNewSGDNode( grad_node = node; } } - PADDLE_ENFORCE(grad_node); + PADDLE_ENFORCE_NOT_NULL(grad_node, platform::errors::NotFound( + "Can not find control dep variable.")); // create a new SGD node OpDesc* old_desc = optimize_node->Op(); @@ -212,8 +234,14 @@ ir::Node* LockFreeOptimizePass::CreateNewSGDNode( } // SGD must have only one param and LR in - PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u); - PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u); + PADDLE_ENFORCE_EQ( + old_desc->Input("LearningRate").size(), 1u, + platform::errors::InvalidArgument( + "In op(%s), find input(LearningRate) failed.", old_desc->Type())); + PADDLE_ENFORCE_EQ( + old_desc->Input("Param").size(), 1u, + platform::errors::InvalidArgument("In op(%s), find input(Param) failed.", + old_desc->Type())); // LR and weight nodes should be copied for (Node* upstream_node : optimize_node->inputs) { @@ -245,9 +273,17 @@ std::vector LockFreeOptimizePass::FindConnectedNode( void LockFreeOptimizePass::ReplaceUpstreamNode( ir::Node* upstream_node, ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const { - PADDLE_ENFORCE(upstream_node); - PADDLE_ENFORCE(old_optimizer_node); - PADDLE_ENFORCE(new_optimizer_node); + PADDLE_ENFORCE_NOT_NULL( + upstream_node, platform::errors::InvalidArgument( + "Input argument upstream_node cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + old_optimizer_node, + platform::errors::InvalidArgument( + "Input 
argument old_optimizer_node cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + new_optimizer_node, + platform::errors::InvalidArgument( + "Input argument new_optimizer_node cannot be nullptr.")); // Remove the old_optimizer_node from upstream_node's outputs vector auto& output_node_vec = upstream_node->outputs; @@ -268,8 +304,14 @@ void LockFreeOptimizePass::ReplaceUpstreamNode( void LockFreeOptimizePass::ReplaceAllDownstreamNode( ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const { - PADDLE_ENFORCE(old_optimizer_node); - PADDLE_ENFORCE(new_optimizer_node); + PADDLE_ENFORCE_NOT_NULL( + old_optimizer_node, + platform::errors::InvalidArgument( + "Input argument old_optimizer_node cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + new_optimizer_node, + platform::errors::InvalidArgument( + "Input argument new_optimizer_node cannot be nullptr.")); for (ir::Node* downstream_node : old_optimizer_node->outputs) { // Remove the old_optimizer_node from downstream_node's inputs vector @@ -292,8 +334,12 @@ void LockFreeOptimizePass::ReplaceAllDownstreamNode( ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp( ir::Graph* graph, ir::Node* backward_node) const { - PADDLE_ENFORCE(graph); - PADDLE_ENFORCE(backward_node); + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Input argument graph cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + backward_node, platform::errors::InvalidArgument( + "Input argument backward_node cannot be nullptr.")); // strip the suffix _grad of backward_node's name std::string forward_op_name = backward_node->Name(); diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 9c923480bac26..f38f48fcd92a6 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -87,34 +87,46 @@ class LockFreeOptimizePass : public Pass { ir::Node* downstream_node) const; inline bool 
IsOpNamed(ir::Node* node, const std::string& name) const { - PADDLE_ENFORCE(node); + PADDLE_ENFORCE_NOT_NULL(node, + platform::errors::InvalidArgument( + "Input argument node cannot be nullptr.")); return node->NodeType() == Node::Type::kOperation && node->Name() == name; } inline bool IsVarNamed(ir::Node* node, const std::string& name) const { - PADDLE_ENFORCE(node); + PADDLE_ENFORCE_NOT_NULL(node, + platform::errors::InvalidArgument( + "Input argument node cannot be nullptr.")); return node->NodeType() == Node::Type::kVariable && node->Name() == name; } inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const { - PADDLE_ENFORCE(node); + PADDLE_ENFORCE_NOT_NULL(node, + platform::errors::InvalidArgument( + "Input argument node cannot be nullptr.")); return node->NodeType() == Node::Type::kVariable && boost::algorithm::ends_with(node->Name(), name); } inline bool IsVarNameContains(ir::Node* node, const std::string& name) const { - PADDLE_ENFORCE(node); + PADDLE_ENFORCE_NOT_NULL(node, + platform::errors::InvalidArgument( + "Input argument node cannot be nullptr.")); return node->NodeType() == Node::Type::kVariable && node->Name().find(name) != std::string::npos; } inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const { - PADDLE_ENFORCE(ctrl_dep_node); - PADDLE_ENFORCE(node); + PADDLE_ENFORCE_NOT_NULL( + ctrl_dep_node, platform::errors::InvalidArgument( + "Input argument ctrl_dep_node cannot be nullptr.")); + PADDLE_ENFORCE_NOT_NULL(node, + platform::errors::InvalidArgument( + "Input argument node cannot be nullptr.")); return IsControlDepVar(*ctrl_dep_node) && ctrl_dep_node->inputs.size() >= 1u && diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc index 6ce14203629e0..b1afa47910fad 100644 --- 
a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc @@ -116,7 +116,10 @@ std::vector BufferSharedCrossOpMemoryReusePass::SortOp( graph_view.BreadthFirstVisit( [&](OpHandleBase *cur_op) { sorted_ops.emplace_back(cur_op); }); PADDLE_ENFORCE_EQ(sorted_ops.size(), graph_view.OpNumber(), - "There are unvisited ops"); + platform::errors::InvalidArgument( + "Sorted ops size(%d) not equal to graph op size(%d). " + "There are unvisited ops.", + sorted_ops.size(), graph_view.OpNumber())); return sorted_ops; } @@ -181,7 +184,9 @@ void BufferSharedCrossOpMemoryReusePass::RunOnScopeIdx(size_t idx) const { auto *out_node = *(out_nodes.begin()); auto *out_var = dynamic_cast(&(out_node->Wrapper())); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Can not find a valid Var Node for Var %s.", out_arg)); // If out_arg is not reusable, skip it if (!IsOutVarReusable(*out_var)) { @@ -269,7 +274,8 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween( auto op_dep = GetOpDep(prev_op, op); if (op_dep == NodeDependency::kBefore) continue; PADDLE_ENFORCE_EQ(op_dep, NodeDependency::kNoDep, - "The graph has circle, this may be a bug"); + platform::errors::InvalidArgument( + "The graph has circle, this may be a bug.")); auto iter = std::find_if(prev_op->Outputs().begin(), prev_op->Outputs().end(), @@ -316,9 +322,13 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween( } void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const { - PADDLE_ENFORCE(ops_.empty(), "ops_ must be initialized here"); - PADDLE_ENFORCE(op_to_idx_.empty(), "op_to_idx_ must be initialized here"); - PADDLE_ENFORCE(deps_.empty(), "deps_ must be initialized here"); + PADDLE_ENFORCE_EQ(ops_.empty(), true, platform::errors::InvalidArgument( + "Ops must be initialized here.")); + 
PADDLE_ENFORCE_EQ( + op_to_idx_.empty(), true, + platform::errors::InvalidArgument("Op to idx must be initialized here.")); + PADDLE_ENFORCE_EQ(deps_.empty(), true, platform::errors::InvalidArgument( + "Deps must be initialized here.")); // Toposort ops OpGraphView graph_view(ir::FilterByNodeWrapper(*graph_)); @@ -344,7 +354,10 @@ void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const { prev_preceding_ops.end()); } }); - PADDLE_ENFORCE_EQ(preceding_ops.size(), op_num); + PADDLE_ENFORCE_EQ(preceding_ops.size(), op_num, + platform::errors::InvalidArgument( + "Preceding ops size(%d) must equal to op num(%d).", + preceding_ops.size(), op_num)); // Find out ComputationOpHandles only ops_.resize(scope_num); @@ -384,28 +397,43 @@ void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const { size_t BufferSharedCrossOpMemoryReusePass::OpIndex( const ComputationOpHandle *op) const { auto iter = op_to_idx_[op->GetScopeIdx()].find(op); - PADDLE_ENFORCE(iter != op_to_idx_[op->GetScopeIdx()].end()); + PADDLE_ENFORCE_NE(iter, op_to_idx_[op->GetScopeIdx()].end(), + platform::errors::NotFound( + "Can not find op(%s) in op_to_idx_.", op->Name())); return iter->second; } NodeDependency BufferSharedCrossOpMemoryReusePass::GetOpDep( const ComputationOpHandle *op1, const ComputationOpHandle *op2) const { - PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx()); + PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx(), + platform::errors::InvalidArgument( + "Op(%s) and op(%s) must in the same scope.", + op1->Name(), op2->Name())); return deps_[op1->GetScopeIdx()][OpIndex(op1)][OpIndex(op2)]; } void BufferSharedCrossOpMemoryReusePass::SetOpDep( const ComputationOpHandle *op1, const ComputationOpHandle *op2, NodeDependency dep) const { - PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx()); + PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx(), + platform::errors::InvalidArgument( + "Op(%s) and op(%s) must in the same scope.", + op1->Name(), 
op2->Name())); if (op1 == op2) { - PADDLE_ENFORCE(dep == NodeDependency::kSame); + PADDLE_ENFORCE_EQ( + dep, NodeDependency::kSame, + platform::errors::InvalidArgument( + "Set Same Op(%s) Dep, dep must be kSame type.", op1->Name())); auto idx = OpIndex(op1); deps_[op1->GetScopeIdx()][idx][idx] = NodeDependency::kSame; } else { auto idx1 = OpIndex(op1); auto idx2 = OpIndex(op2); - PADDLE_ENFORCE(dep != NodeDependency::kSame && idx1 != idx2); + PADDLE_ENFORCE_EQ((dep != NodeDependency::kSame && idx1 != idx2), true, + platform::errors::InvalidArgument( + "Op(%s) and Op(%s) should not have same " + "index(%d), and dep should not kSame type.", + op1->Name(), op2->Name(), idx1)); deps_[op1->GetScopeIdx()][idx1][idx2] = dep; deps_[op1->GetScopeIdx()][idx2][idx1] = ReverseNodeDependency(dep); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index 338a608b4ae3d..0b42f2ebd5555 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -57,7 +57,9 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { auto *op = *(pair.second.ops().begin()); const std::string &op_type = op->GetOp()->Type(); const framework::OpDesc *op_desc = op->Node()->Op(); - PADDLE_ENFORCE_NOT_NULL(op_desc); + PADDLE_ENFORCE_NOT_NULL( + op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.", + op->Name())); auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_; if (!infer_inplace) { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc index 9a322bdc1dce1..7b9b5aa623074 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc 
@@ -58,8 +58,12 @@ static int64_t GetMemorySize( &vars, const std::string &var_name) { auto *var_desc = TryGetLatestVarDesc(vars.at(var_name)); - PADDLE_ENFORCE_NOT_NULL(var_desc); - PADDLE_ENFORCE(IsLoDTensor(var_desc)); + PADDLE_ENFORCE_NOT_NULL( + var_desc, + platform::errors::NotFound("Var(%s) can not find VarDesc.", var_name)); + PADDLE_ENFORCE_EQ(IsLoDTensor(var_desc), true, + platform::errors::InvalidArgument( + "Var(%s) must be LoDTensor.", var_name)); auto dims = var_desc->GetShape(); return SizeOfType(var_desc->GetDataType()) * std::accumulate(dims.begin(), dims.end(), static_cast(1), diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h index 4f6bacecab4aa..94842485440bd 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h @@ -42,8 +42,10 @@ class MemOptVarInfo { } void SetRefCnt(size_t ref_cnt) { - PADDLE_ENFORCE_GE(ref_cnt, 1, - "Reference count must be larger than or equal to 1"); + PADDLE_ENFORCE_GE( + ref_cnt, 1, + platform::errors::InvalidArgument( + "Reference count(%d) must be larger than or equal to 1.", ref_cnt)); ref_cnt_ = ref_cnt; runtime_ref_cnt_ = ref_cnt; } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index 20c7968d6ac56..221b0a76e7ef5 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -66,7 +66,11 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, details::VarHandle *out_var) const { auto *op = dynamic_cast(out_var->GeneratedOp()); - PADDLE_ENFORCE_NOT_NULL(op); + PADDLE_ENFORCE_NOT_NULL( + op, + platform::errors::InvalidArgument( + "Var(%s) have no GeneratedOp, or it's op is not 
ComputationOpHandle.", + out_var->Name())); if (IsVarPairReusable(*in_var, *out_var)) { AddReuseVar(op, in_var, out_var); return true; @@ -91,10 +95,13 @@ VarDesc *MemoryReusePass::GetVarDesc(const details::VarHandle &var) const { size_t scope_idx = var.scope_idx(); auto iter = var_descs_[scope_idx].find(var_name); if (iter == var_descs_[scope_idx].end()) { - PADDLE_ENFORCE((*all_vars_)[scope_idx].count(var_name), - "Variable %s not found", var_name); + PADDLE_ENFORCE_NE( + (*all_vars_)[scope_idx].count(var_name), 0, + platform::errors::NotFound("Variable %s not found.", var_name)); auto *desc = TryGetLatestVarDesc((*all_vars_)[scope_idx].at(var_name)); - PADDLE_ENFORCE_NOT_NULL(desc); + PADDLE_ENFORCE_NOT_NULL( + desc, + platform::errors::NotFound("Var(%s) can not find VarDesc.", var_name)); var_descs_[scope_idx].emplace(var_name, desc); return desc; } else { @@ -119,7 +126,9 @@ void MemoryReusePass::CollectShareTensorBufferOpHandles() const { if (share_buffer_op != nullptr) { auto *compute_op = details::GetUniquePendingComputationOpHandle(share_buffer_op); - PADDLE_ENFORCE(ops_.count(compute_op) == 0); + PADDLE_ENFORCE_EQ( + ops_.count(compute_op), 0, + platform::errors::AlreadyExists("Compute op already exists.")); ops_.emplace(compute_op, share_buffer_op); } } @@ -227,8 +236,11 @@ bool MemoryReusePass::IsInVarReusable(const details::VarHandle &in_var) const { */ bool MemoryReusePass::IsOutVarReusable( const details::VarHandle &out_var) const { - PADDLE_ENFORCE_NOT_NULL(dynamic_cast( - out_var.GeneratedOp())); + PADDLE_ENFORCE_NOT_NULL( + dynamic_cast(out_var.GeneratedOp()), + platform::errors::InvalidArgument( + "Var(%s) have no GeneratedOp, or it's op is not ComputationOpHandle.", + out_var.Name())); const auto out_name = out_var.Name(); if (out_name == kEmptyVarName) { return false; @@ -236,9 +248,10 @@ bool MemoryReusePass::IsOutVarReusable( // out_var must be the first version!!! 
auto out_var_iter = (*all_vars_)[out_var.scope_idx()].find(out_name); - PADDLE_ENFORCE(out_var_iter != (*all_vars_)[out_var.scope_idx()].end() && - !out_var_iter->second.empty(), - "Cannot find variable %s", out_name); + PADDLE_ENFORCE_EQ( + (out_var_iter != (*all_vars_)[out_var.scope_idx()].end() && + !out_var_iter->second.empty()), + true, platform::errors::NotFound("Cannot find variable %s.", out_name)); if (out_var_iter->second[0] != &out_var) { return false; @@ -282,7 +295,11 @@ bool MemoryReusePass::IsVarPairReusable( const details::VarHandle &in_var, const details::VarHandle &out_var) const { auto *op = dynamic_cast(out_var.GeneratedOp()); - PADDLE_ENFORCE_NOT_NULL(op); + PADDLE_ENFORCE_NOT_NULL( + op, + platform::errors::InvalidArgument( + "Var(%s) have no GeneratedOp, or it's op is not ComputationOpHandle.", + out_var.Name())); const auto in_name = in_var.Name(); if (in_name == out_var.Name()) { @@ -308,8 +325,10 @@ bool MemoryReusePass::IsVarPairReusable( void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, details::VarHandle *out_var) const { - PADDLE_ENFORCE((*var_infos_)[op->GetScopeIdx()].count(in_var->Name()) > 0, - "%s does not in mem-opt var infos", in_var->Name()); + PADDLE_ENFORCE_GT( + (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, + platform::errors::NotFound("Var(%s) does not in mem opt var infos.", + in_var->Name())); if (ops_.count(op) == 0) { InsertShareTensorBufferOpHandleToGraph(op); @@ -349,7 +368,10 @@ void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) { last_live_op_of_in_var = op; } else { - PADDLE_ENFORCE(!out_var_op_iter->second.ops().empty()); + PADDLE_ENFORCE_EQ( + out_var_op_iter->second.ops().empty(), false, + platform::errors::InvalidArgument( + "Var(%s)'s last live op should not empty.", out_var->Name())); last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin()); } @@ 
-359,8 +381,9 @@ void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, last_live_ops_of_in_var->insert(last_live_op_of_in_var); auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name()); - PADDLE_ENFORCE(in_var_info_iter != (*var_infos_)[scope_idx].end(), - "Cannot find variable %s", in_var->Name()); + PADDLE_ENFORCE_NE( + in_var_info_iter, (*var_infos_)[scope_idx].end(), + platform::errors::NotFound("Cannot find variable %s.", in_var->Name())); in_var_info_iter->second->SetRefCnt(1); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc index d2cc89a2b49d8..11c2508afb574 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc @@ -39,7 +39,7 @@ void OpGraphView::Build(const std::vector &ops) { } PADDLE_ENFORCE( preceding_ops_.size() == ops.size() && pending_ops_.size() == ops.size(), - "There are duplicate ops in graph."); + platform::errors::InvalidArgument("There are duplicate ops in graph.")); } std::unordered_set OpGraphView::AllOps() const { @@ -56,8 +56,10 @@ bool OpGraphView::HasOp(details::OpHandleBase *op) const { } void OpGraphView::EnforceHasOp(details::OpHandleBase *op) const { - PADDLE_ENFORCE(HasOp(op), "Cannot find op %s in OpGraphView", - op == nullptr ? "nullptr" : op->DebugString()); + PADDLE_ENFORCE_EQ(HasOp(op), true, + platform::errors::NotFound( + "Cannot find op %s in OpGraphView.", + op == nullptr ? 
"nullptr" : op->DebugString())); } const std::unordered_set &OpGraphView::PendingOps( diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h index 86b25c13959a7..5fb2caedba85d 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h @@ -127,9 +127,13 @@ void OpGraphView::BreadthFirstVisit(Callback &&callback) const { } } - PADDLE_ENFORCE_EQ(num_calls, op_num, "There are unvisited ops"); - PADDLE_ENFORCE_EQ(visited_ops.size(), op_num, "There are unvisited ops"); - PADDLE_ENFORCE(op_deps.empty(), "There are unvisited ops"); + PADDLE_ENFORCE_EQ(num_calls, op_num, platform::errors::InvalidArgument( + "There are unvisited ops.")); + PADDLE_ENFORCE_EQ( + visited_ops.size(), op_num, + platform::errors::InvalidArgument("There are unvisited ops.")); + PADDLE_ENFORCE_EQ(op_deps.empty(), true, platform::errors::InvalidArgument( + "There are unvisited ops.")); } } // namespace ir diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc index 4584b3d4e0f07..88d1b2aa003ce 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc @@ -77,11 +77,15 @@ class ShrinkDepsOpFunctor { const std::vector &ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { - PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + PADDLE_ENFORCE_EQ( + graph_.HasOp(ops[i]), true, + platform::errors::InvalidArgument("Op does not exist in graph.")); op_to_idx[ops[i]] = i; } - PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + PADDLE_ENFORCE_EQ( + op_to_idx.size(), ops.size(), + platform::errors::InvalidArgument("Graph may have duplicate ops.")); std::vector> ret(ops.size()); for 
(auto &e : ret) { @@ -247,9 +251,9 @@ ExtractComputationOpFromLastLivedVar(details::VarHandle *var, size_t scope_idx, return {}; } - PADDLE_ENFORCE_EQ( - computation_ops.empty(), false, - platform::errors::InvalidArgument("Computation ops should not be empty")); + PADDLE_ENFORCE_EQ(computation_ops.empty(), false, + platform::errors::InvalidArgument( + "Computation ops should not be empty.")); // stage four. Try to shrink computation op if they depend on each other. // Get the smallest set of the most ops. @@ -263,8 +267,9 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { Get>(kLastLiveOpsOfVars); PADDLE_ENFORCE(last_live_ops_of_vars.empty() && var_infos.empty(), - "Last Live Ops and Reference Counts of vars should be " - "initialized at here."); + platform::errors::InvalidArgument( + "Last live ops and reference counts of vars should be " + "initialized at here.")); const auto &vars = graph->Get(details::kGraphVars); @@ -304,11 +309,15 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { auto &var_name = name_var_pair.first; auto &var_handles = name_var_pair.second; - PADDLE_ENFORCE_EQ(var_desc->Name(), var_name); - PADDLE_ENFORCE_EQ( - var_handles.empty(), false, - platform::errors::InvalidArgument("Variable %s not found", var_name)); + PADDLE_ENFORCE_EQ( + var_desc->Name(), var_name, + platform::errors::InvalidArgument( + "A Var, it's VarName(%s) and DescName(%s) not same.", var_name, + var_desc->Name())); + + PADDLE_ENFORCE_EQ(var_handles.empty(), false, + platform::errors::InvalidArgument( + "Variable %s not found.", var_name)); auto last_ver_var = var_handles.back(); if (last_ver_var->Node()->IsCtrlVar()) { @@ -327,12 +336,13 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { continue; } + PADDLE_ENFORCE_EQ(status, LastLiveOpSearchStatus::kSuccess, + platform::errors::InvalidArgument( + "Status(%d) must be success.", status)); PADDLE_ENFORCE_EQ( - status, LastLiveOpSearchStatus::kSuccess, - platform::errors::InvalidArgument("status must be
success")); - PADDLE_ENFORCE_EQ(result.empty(), false, - platform::errors::NotFound( - "Last living ops of %s cannot be empty", var_name)); + result.empty(), false, + platform::errors::NotFound("Last living ops of %s cannot be empty.", + var_name)); std::string last_live_ops_log_str; for (auto &each_ret : result) { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 119917428997b..45ff275d53085 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -22,7 +22,8 @@ namespace framework { namespace ir { void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL(graph, "graph cannot be nullptr."); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("conv_activation_mkldnn_fuse", graph); GraphPatternDetector gpd; @@ -75,7 +76,8 @@ void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const { GraphSafeRemoveNodes(graph, {activation, conv_out}); PADDLE_ENFORCE_GT(subgraph.count(conv_input), 0UL, - "subgraph has to contain conv_input node."); + platform::errors::InvalidArgument( + "Subgraph has to contain conv input node.")); IR_NODE_LINK_TO(conv, activation_out); found_conv_activation_count++; }; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index bbfc8c005580b..82e0af3c19875 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -26,7 +26,11 @@ namespace ir { template LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, BinaryOperation f) { - PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims()); + PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims(), + 
platform::errors::InvalidArgument( + "Input two tensors must have same shape, but they are " + "different: %s, %s.", + vec_a.dims(), vec_b.dims())); LoDTensor vec_y; vec_y.Resize(vec_a.dims()); const float* a = vec_a.data(); @@ -39,11 +43,13 @@ LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, } void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); - PADDLE_ENFORCE(scope); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); GraphPatternDetector gpd; auto* conv_input = @@ -68,7 +74,9 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { // elementwise_add op GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); - PADDLE_ENFORCE(subgraph.count(conv_input)); + PADDLE_ENFORCE_NE( + subgraph.count(conv_input), 0, + platform::errors::NotFound("Detector did not find conv input.")); // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); @@ -86,10 +94,16 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { if (has_bias && conv->Op()->Input("Bias").size() > 0) { auto conv_bias_names = conv->Op()->Input("Bias"); // add eltwise bias to existing conv bias - PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1); + PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1, + platform::errors::NotFound("Can not find var Bias.")); auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); auto* conv_bias_tensor = conv_bias_var->GetMutable(); - PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims()); + PADDLE_ENFORCE_EQ( + conv_bias_tensor->dims(), eltwise_bias_tensor->dims(), + platform::errors::InvalidArgument( + "Conv bias tensor and eltwise bias tensor " + "must have same shape, but they are different: %s, %s.", + 
conv_bias_tensor->dims(), eltwise_bias_tensor->dims())); *conv_bias_tensor = tensor_apply_eltwise( *conv_bias_tensor, *eltwise_bias_tensor, std::plus()); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc index 9e8f0f0c46cee..af64cb22054e9 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -39,7 +39,10 @@ void ConvConcatReLUFusePass::FindConcatWithConvs( for (auto node : concat_inputs) { auto prev_op_node = node->inputs; - PADDLE_ENFORCE_EQ(prev_op_node.size(), 1); + PADDLE_ENFORCE_EQ(prev_op_node.size(), 1, + platform::errors::InvalidArgument( + "Node(%s) input size(%d) must be 1.", node->Name(), + prev_op_node.size())); auto* conv_op = prev_op_node[0]; if (conv_op->Op()->Type() != "conv2d") return; @@ -103,7 +106,8 @@ void ConvConcatReLUFusePass::FuseConvConcatReLU( } void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); std::unordered_map concat_with_convs_counter; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 9881f7f9e56fd..23419d5b9e0a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -68,10 +68,10 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, auto inputs = op->Op()->InputNames(); bool name_found = std::find(inputs.begin(), inputs.end(), input_name) != inputs.end(); - PADDLE_ENFORCE_EQ( - name_found, true, - platform::errors::InvalidArgument("%s isn't the input of the %s operator", - input_name, op->Op()->Type())); + PADDLE_ENFORCE_EQ(name_found, true, + platform::errors::InvalidArgument( + 
"Var(%s) isn't the input of the %s operator.", + input_name, op->Op()->Type())); unsigned max = is_unsigned ? U8_MAX : S8_MAX; float scale = scale_to_one * max; @@ -110,8 +110,14 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, std::string scale_attr_name) const { auto inputs = op->inputs; auto output = op->outputs[0]; - PADDLE_ENFORCE_GE(inputs.size(), 1); - PADDLE_ENFORCE_EQ(op->outputs.size(), 1); + PADDLE_ENFORCE_GE(inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal or greater than 1.", + op->Name(), inputs.size())); + PADDLE_ENFORCE_EQ(op->outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal to 1.", op->Name(), + op->outputs.size())); // create a quantize op desc prototype OpDesc q_desc; @@ -159,8 +165,8 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, std::find(outputs.begin(), outputs.end(), output_name) != outputs.end(); PADDLE_ENFORCE_EQ(name_found, true, platform::errors::InvalidArgument( - "%s isn't the output of the %s operator", output_name, - op->Op()->Type())); + "Var(%s) isn't the output of the %s operator.", + output_name, op->Op()->Type())); unsigned max = is_unsigned ? 
U8_MAX : S8_MAX; float scale = scale_to_one * max; @@ -682,10 +688,12 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { bool is_x_unsigned{false}, is_y_unsigned{false}; auto input_x_scale = GetScaleValueForNode(matmul_in_x, &is_x_unsigned); auto input_y_scale = GetScaleValueForNode(matmul_in_y, &is_y_unsigned); - PADDLE_ENFORCE_EQ( - is_x_unsigned, is_y_unsigned, - platform::errors::InvalidArgument( - "Matmul inputs should have the same value of is_unsigned")); + PADDLE_ENFORCE_EQ(is_x_unsigned, is_y_unsigned, + platform::errors::InvalidArgument( + "Matmul inputs should have the same " + "attribute of signed/unsigned, but they " + "are different: x(%d), y(%d).", + is_x_unsigned, is_y_unsigned)); QuantizeInput(g, matmul_op, matmul_in_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); QuantizeInput(g, matmul_op, matmul_in_y, "Y", input_y_scale, is_y_unsigned, @@ -785,10 +793,12 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); - PADDLE_ENFORCE(param_scope()); + PADDLE_ENFORCE_NOT_NULL(param_scope(), platform::errors::InvalidArgument( + "Scope cannot be nullptr.")); QuantizeConv(graph, false /* with_residual_data */); QuantizeConv(graph, true /* with_residual_data */); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 130ba44ff64c7..bc24c10d9d0ae 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -75,7 +75,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( BOOST_GET_CONST(float, quant_op->Op()->GetAttr("Scale")); PADDLE_ENFORCE_NE( nodes_keep_counter->find(dequant_out), nodes_keep_counter->end(), - 
platform::errors::NotFound("The dequant output node is not found")); + platform::errors::NotFound("The dequant output node is not found.")); // check if dequantize op should be kept or removed, decrease the counter bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; @@ -153,8 +153,9 @@ void CPUQuantizeSquashPass::OpRequantSquash(Graph* graph) const { PADDLE_ENFORCE_NE( any_op_output_name.empty(), true, - platform::errors::NotFound("Operator before requantize operator " - "should have requantize input as output")); + platform::errors::NotFound("Operator before requantize operator(%s) " + "should have requantize input as output.", + requant_in->Name())); float requant_scale_out = BOOST_GET_CONST(float, requant_op->Op()->GetAttr("Scale_out")); @@ -195,10 +196,11 @@ void CPUQuantizeSquashPass::RequantOpSquash(Graph* graph) const { for (auto input_name : any_op->Op()->Input(name)) if (input_name == requant_out->Name()) any_op_input_name = name; - PADDLE_ENFORCE_NE( - any_op_input_name.empty(), true, - platform::errors::NotFound("The operator after requantize operator " - "should have requantize output as input")); + PADDLE_ENFORCE_NE(any_op_input_name.empty(), true, + platform::errors::NotFound( + "The operator after requantize operator(%s) " + "should have requantize output as input.", + requant_out->Name())); float requant_scale_in = boost::get(requant_op->Op()->GetAttr("Scale_in")); @@ -206,11 +208,14 @@ void CPUQuantizeSquashPass::RequantOpSquash(Graph* graph) const { if (any_op->Op()->Type() == "matmul") scale_name = any_op_input_name == "X" ? 
"Scale_x" : "Scale_y"; - PADDLE_ENFORCE_EQ(requant_op->Op()->GetAttrIfExists("Scale_out"), - any_op->Op()->GetAttrIfExists(scale_name), - platform::errors::InvalidArgument( - "The operator after requantize should have input " - "scale equal to requantize output scale")); + PADDLE_ENFORCE_EQ( + requant_op->Op()->GetAttrIfExists("Scale_out"), + any_op->Op()->GetAttrIfExists(scale_name), + platform::errors::InvalidArgument( + "The operator after requantize should have input " + "scale(%f) equal to requantize output scale(%f).", + any_op->Op()->GetAttrIfExists(scale_name), + requant_op->Op()->GetAttrIfExists("Scale_out"))); any_op->Op()->SetAttr(scale_name, requant_scale_in); any_op->Op()->SetInput(any_op_input_name, std::vector({requant_in->Name()})); @@ -286,8 +291,9 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { auto* first_quant_out = first_quant_op->outputs[0]; float scale = first_quant_op->Op()->GetAttrIfExists("Scale"); - PADDLE_ENFORCE_NE(scale, 0, platform::errors::InvalidArgument( - "Quantize scale should not be equal 0")); + PADDLE_ENFORCE_NE(scale, 0, + platform::errors::InvalidArgument( + "Quantize scale(%f) should not be equal 0.", scale)); for (int iter = prev_out->outputs.size() - 1; iter >= 0; iter--) { auto quant_op = prev_out->outputs[iter]; @@ -304,8 +310,9 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { PADDLE_ENFORCE_NE( last_op_input_name.empty(), true, - platform::errors::NotFound("Operator after quantize operator " - "should has quantize output as input")); + platform::errors::NotFound("Operator after quantize operator(%s) " + "should has quantize output as input.", + quant_out->Name())); last_op->Op()->SetInput( last_op_input_name, std::vector({first_quant_out->Name()})); @@ -345,10 +352,12 @@ void CPUQuantizeSquashPass::DequantScaleSquash(Graph* graph) const { PADDLE_ENFORCE_GT(dequant_scale, 0.0f, platform::errors::InvalidArgument( - "Dequantize scale should have positive value")); + 
"Dequantize scale(%f) should have positive value.", + dequant_scale)); PADDLE_ENFORCE_GT(scale_scale, 0.0f, platform::errors::InvalidArgument( - "Scale of scale op should have positive value")); + "Scale(%f) of scale op should have positive value.", + scale_scale)); dequant_op->Op()->SetAttr("Scale", dequant_scale / scale_scale); dequant_op->Op()->SetOutput( @@ -367,8 +376,8 @@ void CPUQuantizeSquashPass::DequantScaleSquash(Graph* graph) const { void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, - platform::errors::NotFound( - "The graph in function CPUQuantizeSquashPass::ApplyImpl is null")); + platform::errors::InvalidArgument( + "The graph in function CPUQuantizeSquashPass::ApplyImpl is null.")); FusePassBase::Init("cpu_quantize_squash_pass", graph); std::unordered_map nodes_keep_counter; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 9b827fdf6fef1..37af0274ea8a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -57,7 +57,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, PADDLE_ENFORCE_EQ(inputs.size(), 2UL, platform::errors::InvalidArgument( "The fc inputs should contain input and weights, but " - "now the size of inputs is %d", + "now the size of inputs is %d.", inputs.size())); op->SetInput("W", {inputs[1]}); op->SetOutput("Out", outputs); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index e854559ae7a87..c5965701a53d4 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -19,14 +19,17 @@ namespace paddle { namespace framework { namespace ir { -#define GET_NODE(id, pattern) \ - 
PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ - "pattern has no Node called %s", #id); \ - auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ - PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE_NE(subgraph.count(pattern.RetrieveNode(#id)), 0, \ + platform::errors::InvalidArgument( \ + "Pattern has no Node called %s.", #id)); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL( \ + id, platform::errors::InvalidArgument("Subgraph has no node %s.", #id)); void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("depthwise_conv_mkldnn_pass", graph); GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 0d720e828b6d0..6c87e437caa1b 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -46,12 +46,15 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (scale_op->Op()->GetAttrIfExists("bias") == 0.0) { auto matmul_alpha = matmul_op->Op()->GetAttrIfExists("alpha"); auto scale_scale = scale_op->Op()->GetAttrIfExists("scale"); - PADDLE_ENFORCE_GT(matmul_alpha, 0.0f, - platform::errors::InvalidArgument( - "Alpha of matmul op should have positive value")); + PADDLE_ENFORCE_GT( + matmul_alpha, 0.0f, + platform::errors::InvalidArgument( + "Alpha(%f) of matmul op should have positive value.", + matmul_alpha)); PADDLE_ENFORCE_GT(scale_scale, 0.0f, platform::errors::InvalidArgument( - "Scale of scale op should have positive value")); + "Scale(%f) of scale op should have positive value.", + scale_scale)); std::string matmul_op_input_name; for (auto name : matmul_op->Op()->InputNames()) @@ -60,8 +63,9 @@ void 
ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NE( matmul_op_input_name.empty(), true, - platform::errors::NotFound("Operator after scale operator " - "should have scale output as input")); + platform::errors::NotFound("Operator after scale operator(%s) " + "should have scale output as input.", + scale_out->Name())); matmul_op->Op()->SetAttr("alpha", matmul_alpha * scale_scale); matmul_op->Op()->SetInput(matmul_op_input_name, std::vector({scale_in->Name()})); diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index d67f2274ebf1f..456e642ad86ab 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -85,7 +85,9 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { // 1. record op nodes of different roles for (auto node : nodes) { if (!node->IsOp()) continue; - PADDLE_ENFORCE(node->Op(), "must find opdesc"); + PADDLE_ENFORCE_NOT_NULL( + node->Op(), platform::errors::InvalidArgument( + "Node(%s) must hold op description.", node->Name())); int op_role = BOOST_GET_CONST( int, node->Op()->GetAttr( framework::OpProtoAndCheckerMaker::OpRoleAttrName())); @@ -108,7 +110,9 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { } else if (op_role & static_cast(framework::OpRole::kLRSched)) { lr_ops.push_back(node); } else { // NOLINT - PADDLE_THROW("Invalid op_role: %d", static_cast(op_role)); + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid op role(%d), in node(%s).", static_cast(op_role), + node->Name())); } } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc index 8923dfc3232fb..6d5e4ac27bf8a 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -45,7 +45,9 @@ 
class AllReduceDepsPass : public ir::Pass { for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { auto op_handle = dynamic_cast(all_reduce_op_handles[i]); - PADDLE_ENFORCE(op_handle, "op_handle must be NCCLOpHandleBase"); + PADDLE_ENFORCE_NOT_NULL(op_handle, + platform::errors::InvalidArgument( + "Op handle must be NCCLOpHandleBase.")); op_handle->SetRunEnv(i, use_hierarchical_allreduce); } #endif @@ -95,7 +97,9 @@ class AllReduceDepsPass : public ir::Pass { } } - PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle."); + PADDLE_ENFORCE_NE( + next_ready_ops.size(), 0, + platform::errors::InvalidArgument("There may be a cycle.")); ready_ops.clear(); std::swap(ready_ops, next_ready_ops); GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); @@ -122,18 +126,25 @@ class AllReduceDepsPass : public ir::Pass { // NOTE(zcd): For distributed training, it is important to keep the order of // allReduce on each node consistent. Otherwise, hang may occur. // Sort the current_all_reduce_op_handles according to the name of input. 
- sort(current_all_reduce_op_handles.begin(), - current_all_reduce_op_handles.end(), - [](const details::OpHandleBase* left, - const details::OpHandleBase* right) -> bool { - auto left_in_vars = - details::DynamicCast(left->Inputs()); - auto right_in_vars = - details::DynamicCast(right->Inputs()); - PADDLE_ENFORCE_GT(left_in_vars.size(), 0); - PADDLE_ENFORCE_GT(right_in_vars.size(), 0); - return left_in_vars[0]->Name() > right_in_vars[0]->Name(); - }); + sort( + current_all_reduce_op_handles.begin(), + current_all_reduce_op_handles.end(), + [](const details::OpHandleBase* left, + const details::OpHandleBase* right) -> bool { + auto left_in_vars = + details::DynamicCast(left->Inputs()); + auto right_in_vars = + details::DynamicCast(right->Inputs()); + PADDLE_ENFORCE_GT(left_in_vars.size(), 0, + platform::errors::InvalidArgument( + "OpHandle(%s) inputs size must greater than 0.", + left->Name())); + PADDLE_ENFORCE_GT(right_in_vars.size(), 0, + platform::errors::InvalidArgument( + "OpHandle(%s) inputs size must greater than 0.", + right->Name())); + return left_in_vars[0]->Name() > right_in_vars[0]->Name(); + }); all_reduce_op_handles->insert(all_reduce_op_handles->end(), current_all_reduce_op_handles.begin(), @@ -170,7 +181,10 @@ class AllReduceDepsPass : public ir::Pass { break; } } - PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input."); + PADDLE_ENFORCE_EQ( + find_valid_input, true, + platform::errors::NotFound( + "In OpHandle(%s) Doesn't find valid input.", op->Name())); } VLOG(10) << out2.str(); if (grads_of_stale_program != all_reduce_op_handles.size()) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc index 782c51a032c03..2aae14fa33391 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc +++ 
b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc @@ -179,9 +179,10 @@ class BackWardOpDepsPass : public ir::Pass { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. auto backward_vars = details::GetOpRoleVarsOrEmpty(op_desc); - PADDLE_ENFORCE_EQ(node->IsWrappedBy(), true, - platform::errors::InvalidArgument( - "Node must be wrapped by OpHandleBase")); + PADDLE_ENFORCE_EQ( + node->IsWrappedBy(), true, + platform::errors::InvalidArgument( + "Node(%s) must be wrapped by OpHandleBase.", node->Name())); backward_op_handles->emplace_back(&node->Wrapper()); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index 86fbbaf7720be..81c98ecf0c0b6 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -64,9 +64,10 @@ class FuseAllReduceOpPass : public ir::Pass { PADDLE_ENFORCE_EQ( all_reduce_ops.size(), grads.size(), platform::errors::Unimplemented( - "The number of all_reduce OpHandle is not equal to the " - "number of grads. Maybe some gradients are sparse type, " - "it is not supported currently.")); + "The number of all_reduce OpHandle(%d) is not equal to the " + "number of grads(%d). 
Maybe some gradients are sparse type, " + "it is not supported currently.", + all_reduce_ops.size(), grads.size())); auto &group_params_grads = graph->Get( details::kGroupParamsAndDenseGrads); @@ -79,7 +80,10 @@ class FuseAllReduceOpPass : public ir::Pass { for (auto &group_p_g : group_params_grads) { size_t group_size = group_p_g.size(); - PADDLE_ENFORCE_GT(group_size, static_cast(0)); + PADDLE_ENFORCE_GT( + group_size, static_cast(0), + platform::errors::InvalidArgument( + "Parameter and Parameter@grad in one group, must not be empty.")); std::vector group_all_reduce_ops; group_all_reduce_ops.reserve(group_size); for (auto &p_g : group_p_g) { @@ -103,26 +107,40 @@ class FuseAllReduceOpPass : public ir::Pass { all_reduce_ops.reserve(grads.size()); for (auto &node : result.Nodes()) { if (node->IsOp()) { - PADDLE_ENFORCE(node->IsWrappedBy()); + PADDLE_ENFORCE_EQ( + node->IsWrappedBy(), true, + platform::errors::InvalidArgument( + "Op Node(%s) should Wrapped by OpHandleBase.", node->Name())); auto *all_reduce_op_handle = dynamic_cast( &node->Wrapper()); if (all_reduce_op_handle) { #if defined(PADDLE_WITH_DGC) PADDLE_ENFORCE_NE( all_reduce_op_handle->Name(), "sparse_all_reduce", - "DGC doesn't support fuse for now, if you want to use DGC " - "you need set strategy.fuse_all_reduce_ops = False."); + platform::errors::InvalidArgument( + "DGC doesn't support fuse for now, if you want to use DGC " + "you need set strategy.fuse_all_reduce_ops = False.")); #endif auto inputs = details::DynamicCast( all_reduce_op_handle->Inputs()); - PADDLE_ENFORCE_EQ(inputs.size(), num_place); + PADDLE_ENFORCE_EQ(inputs.size(), num_place, + platform::errors::InvalidArgument( + "The input size(%d) of all reduce op must " + "equal to place cnt(%d)!", + inputs.size(), num_place)); // The inputs' name should be the same. 
auto &grad_name = inputs[0]->name(); for (size_t i = 1; i < inputs.size(); ++i) { - PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name, - "The input name should be the same."); + PADDLE_ENFORCE_EQ( + inputs[i]->name(), grad_name, + platform::errors::InvalidArgument( + "The input name should be the same.diff name: %s %s.", + inputs[i]->name(), grad_name)); } - PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast(0)); + PADDLE_ENFORCE_NE( + grads.count(grad_name), static_cast(0), + platform::errors::InvalidArgument( + "Parameter@grad(%s) must in grad set.", grad_name)); all_reduce_ops.emplace(grad_name, node); } } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc index 8cc33a6ceb9f1..73f8cd67ee89e 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc @@ -24,7 +24,10 @@ namespace ir { class SSAGraghBuilderWithChecker : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph)); + PADDLE_ENFORCE_EQ( + IsValidGraph(graph), true, + platform::errors::InvalidArgument( + "In SSAGraghBuilderWithChecker, invalid Graph input.")); } bool IsValidGraph(const ir::Graph *graph) const { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 4fbd8a878a7cf..fd82d6b10e718 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -163,7 +163,13 @@ void MultiDevSSAGraphBuilderBase::Init() const { nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx(); } #endif - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + 
PADDLE_ENFORCE_EQ( + places_.size(), local_scopes_.size(), + platform::errors::InvalidArgument( + "Places size and LocalScopes not equal " + "Places size(%d), LocalScopes size(%d) " + "If use multi devices, Places size must equas to LocalScopes size.", + places_.size(), local_scopes_.size())); } void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const { @@ -500,7 +506,11 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, SetCommunicationContext(op_handle, places_[i]); auto &vars = result->Get(details::kGraphVars)[i][og]; - PADDLE_ENFORCE(!vars.empty()); + PADDLE_ENFORCE_EQ(vars.empty(), false, + platform::errors::InvalidArgument( + "Can not find Var(%s) in Place[%d] " + "Paddle Can not add AllReduce OP for Var(%s).", + og, i, og)); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad); VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString(); @@ -566,7 +576,11 @@ details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( auto &p = places_[i]; SetCommunicationContext(op_handle, p); auto &vars = result->Get(details::kGraphVars)[i][og]; - PADDLE_ENFORCE(!vars.empty()); + PADDLE_ENFORCE_EQ(vars.empty(), false, + platform::errors::InvalidArgument( + "Can not find Var(%s) in Place[%d] " + "Paddle Can not add Reduce OP for Var(%s).", + og, i, og)); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad); } @@ -590,7 +604,11 @@ bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { bool MultiDevSSAGraphBuilderBase::IsSparseGradient( const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); + PADDLE_ENFORCE_NE(all_vars_.count(og), 0, + platform::errors::InvalidArgument( + "Can not find Var(%s) in VarDescs " + "Paddle Can not add Collective OP for Var(%s).", + og, og)); return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS; } @@ -641,10 +659,20 @@ int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { std::vector, 
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + PADDLE_ENFORCE_EQ( + param_grad.size(), 2U, + platform::errors::InvalidArgument( + "In Node %s, the size of attribute %s must be 2, include Parameter " + "and Parameter@Grad.", + node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName())); int dev_id = GetVarDeviceID(param_grad[1]); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", - node->Op()->Type(), param_grad[0], param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, platform::errors::NotFound( + "Can not find Device ID, for NodeName:%s, " + "NodeType:%s, Param:%s, Param@Grad:%s" + "For this fault, you can consult the " + "Paddle technical personnel for answer ", + node->Name(), node->Op()->Type(), + param_grad[0], param_grad[1])); return dev_id; } @@ -654,10 +682,16 @@ size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( for (auto var_name : var_names) { if (all_vars_.find(var_name) == all_vars_.end()) continue; auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); + PADDLE_ENFORCE_NOT_NULL(var_desc, + platform::errors::NotFound( + "Can not find Var(%s) in Var Desc.", var_name)); auto dim = framework::make_ddim(var_desc->GetShape()); int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); + PADDLE_ENFORCE_GT(numel, 0, + platform::errors::InvalidArgument( + "The numel of Var(%s) must greater than 0" + "Please check your code,about Var(%s) Shape.", + var_name, var_name)); numel_sum += numel; } @@ -736,7 +770,12 @@ int ReduceSSAGraphBuilder::GetOpDeviceID( std::vector, node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + PADDLE_ENFORCE_EQ( + param_grad.size(), 2U, + platform::errors::InvalidArgument( + "In Node %s, The size of attribute %s must be 2, include Parameter " + "and Parameter@Grad.", + node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName())); int dev_id = 
GetVarDeviceID(param_grad[1]); if (dev_id == -1) { @@ -798,7 +837,12 @@ std::vector ReduceSSAGraphBuilder::SortForReduceMode( } } - PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size(), + platform::errors::InvalidArgument( + "Sorted ops calc error!" + "The result for sorted ops size(%d) must be " + "equal to topo ops size(%d).", + sorted_ops.size(), topo_ops.size())); ResetState(); return sorted_ops; @@ -820,14 +864,23 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(result, node); - PADDLE_ENFORCE(op_dev_id != -1, - "Can not schedule the RPC operator to the right place."); + PADDLE_ENFORCE_NE(op_dev_id, -1, platform::errors::InvalidArgument( + "Can not schedule the RPC operator to " + "the right place. NodeName:%s.", + node->Name())); if (node->Op()->Type() == "recv") { auto recv_vars_attr = BOOST_GET_CONST(std::vector, node->Op()->GetNullableAttr( OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + PADDLE_ENFORCE_EQ( + recv_vars_attr.size(), 2UL, + platform::errors::InvalidArgument( + "In Node %s, the size of attribute %s must be 2, include " + "Parameter and Parameter@Grad.", + node->Name(), + OpProtoAndCheckerMaker::OpRoleVarAttrName())); // [parameter, + // gradient] if (recv_vars_attr[0].find(".block") == std::string::npos) { bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); } @@ -879,8 +932,9 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); - PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), - "This hack no longer holds, please fix."); + PADDLE_ENFORCE_EQ(ir::IsControlDepVar(*node->inputs[0]), false, + platform::errors::InvalidArgument( + "This hack no longer holds, please fix.")); // the variable name which contains .block means it was split by // split_byref op if (strategy_.reduce_ == @@ -893,7 +947,12 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { auto send_param_grad = BOOST_GET_CONST( std::vector, node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); + PADDLE_ENFORCE_EQ( + send_param_grad.size(), 2U, + platform::errors::InvalidArgument( + "In Node %s, the size of attribute %s must be 2, include " + "Parameter and Parameter@Grad.", + node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName())); op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); VLOG(10) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; @@ -926,9 +985,10 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { op_dev_id = 0; } - PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", - node->Op()->Type()); - + PADDLE_ENFORCE_NE( + op_dev_id, -1, + platform::errors::NotFound("Can not find the right place for rpc op: %s.", + node->Op()->Type())); // Create fetch_barrier op handle to enable output on all devices. // **NOTE** fetch_barrier should output variables list same as recv op does. 
if (node->Op()->Type() == "fetch_barrier") { @@ -956,7 +1016,10 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { outvar_dev_id = GetVarDeviceID(output->Name()); - PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); + PADDLE_ENFORCE_NE(outvar_dev_id, -1, + platform::errors::NotFound( + "Can not find the right place for the var: %s.", + output->Name())); } p = places_[outvar_dev_id]; ir::Node *new_node = nullptr; @@ -1007,13 +1070,14 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, } else { LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); PADDLE_THROW( - "the distribute training related op should be in [split_byref, " - "concat]."); + platform::errors::Unimplemented("The distribute training related op " + "should be in [split_byref, concat].")); } - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", - node->Op()->Type()); + PADDLE_ENFORCE_NE(op_dev_id, -1, + platform::errors::NotFound( + "Can not find right place for distributed op: %s.", + node->Op()->Type())); CreateComputationalOp(result, node, op_dev_id); return op_dev_id; diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc index efd549e79d0ef..a080b4bc33c53 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc @@ -28,7 +28,10 @@ class SSAGraghBuilderWithPrinterPass : public ir::Pass { void ApplyImpl(ir::Graph *graph) const override { std::unique_ptr fout( new std::ofstream(Get(kGraphvizPath))); - PADDLE_ENFORCE(fout->good()); + PADDLE_ENFORCE_EQ( + fout->good(), true, + platform::errors::Unavailable("Open file fail! 
kGraphvizPath = %s.", + Get(kGraphvizPath))); if (Has("graph_printer")) { Get("graph_printer").Print(*graph, *fout); } else { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc index 7de3b7c605418..bcbd1e066cc1f 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc @@ -54,11 +54,16 @@ class SequentialExecutionPass : public ir::Pass { if (!node->IsOp()) continue; std::unordered_set preceding_ops; for (auto *in : node->inputs) { - PADDLE_ENFORCE(in->IsVar(), - "Preceding Node of Op Nodes must be Var Node"); + PADDLE_ENFORCE_EQ( + in->IsVar(), true, + platform::errors::InvalidArgument( + "Preceding Node(%s) of Op Nodes must be Var Node.", + in->Name())); if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); + PADDLE_ENFORCE_EQ((in->inputs.size() == 1 && in->inputs[0]->IsOp()), + true, + platform::errors::InvalidArgument( + "Preceding Op Node of Var Node must be unique.")); preceding_ops.insert(in->inputs[0]); pending_ops[in->inputs[0]].insert(node); } @@ -72,15 +77,18 @@ class SequentialExecutionPass : public ir::Pass { ir::Node *found_node = nullptr; for (auto *node : ready_ops) { if (IsSameOpDesc(op_desc, node->Op())) { - PADDLE_ENFORCE(found_node == nullptr, - "Found multiple op_desc in graph: %s", - op_desc->Type()); + PADDLE_ENFORCE_EQ( + found_node, nullptr, + platform::errors::InvalidArgument( + "Found multiple op_desc in graph: %s.", op_desc->Type())); found_node = node; } } - PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s", - op_desc->Type()); + PADDLE_ENFORCE_NOT_NULL( + found_node, + platform::errors::NotFound("Cannot find op_desc in graph: %s.", + op_desc->Type())); for (auto *pending_op : 
pending_ops[found_node]) { if (--op_deps.at(pending_op) == 0) { ready_ops.insert(pending_op); diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index fbc0d7599eae1..87e7e64acb71a 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -66,12 +66,18 @@ class Node { std::string Name() const { return name_; } VarDesc* Var() const { - PADDLE_ENFORCE_EQ(IsVar(), true); + PADDLE_ENFORCE_EQ(IsVar(), true, + platform::errors::InvalidArgument( + "Node(%s) must be kVariable type, but not %d.", name_, + static_cast(type_))); return var_desc_.get(); } OpDesc* Op() const { - PADDLE_ENFORCE_EQ(IsOp(), true); + PADDLE_ENFORCE_EQ(IsOp(), true, + platform::errors::InvalidArgument( + "Node(%s) must be kOperation type, but not %d.", + name_, static_cast(type_))); return op_desc_.get(); } @@ -92,8 +98,9 @@ class Node { try { return *boost::any_cast(wrapper_); } catch (boost::bad_any_cast&) { - PADDLE_THROW("Invalid wrapper type error, expected %s, actual %s", - typeid(T).name(), wrapper_type_.name()); + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid wrapper type error, expected %s, actual %s.", + typeid(T).name(), wrapper_type_.name())); } } @@ -114,8 +121,9 @@ class Node { } void RenameVar(const std::string& new_name) { - PADDLE_ENFORCE(type_ == Type::kVariable && var_desc_, - "Must be type of variable"); + PADDLE_ENFORCE_EQ( + type_ == Type::kVariable && var_desc_, true, + platform::errors::InvalidArgument("Node must be type of variable.")); name_ = new_name; var_desc_->SetName(new_name); } diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 78e8b16126484..fb95504d9a53a 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -26,7 +26,8 @@ namespace ir { Graph* Pass::Apply(Graph* graph) const { CheckPrevPass(); - PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty."); + PADDLE_ENFORCE_NOT_NULL( + graph, 
platform::errors::InvalidArgument("Graph cannot be nullptr.")); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE_NE( attrs_.find(attr), attrs_.end(), @@ -40,11 +41,14 @@ Graph* Pass::Apply(Graph* graph) const { } ApplyImpl(graph); // TODO(panyx0718): Add more verifications. - PADDLE_ENFORCE(!HasCircle(*graph), - "Illegal Pass %s. Generated graph shouldn't have cycle.", - Type()); - PADDLE_ENFORCE(VarDescIsConsistency(*graph), - "The VarDescs of persistable variable are not consistency."); + PADDLE_ENFORCE_EQ( + HasCircle(*graph), false, + platform::errors::InvalidArgument( + "Illegal pass %s. Generated graph shouldn't contain cycle.", Type())); + PADDLE_ENFORCE_EQ( + VarDescIsConsistency(*graph), true, + platform::errors::InvalidArgument( + "The VarDescs of persistable variable are not consistency.")); applied_ = true; if (!graph->Has(kPassRecorder)) { graph->Set(kPassRecorder, new PassRecorder); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index b7b46085b9067..0f5ef551f044d 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -55,8 +55,9 @@ class Pass { // Get a reference to the attributed previously set. 
template AttrType &Get(const std::string &attr_name) const { - PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), - "%s attr not registered for pass.", attr_name); + PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(), + platform::errors::InvalidArgument( + "Attribute %s not registered for pass.", attr_name)); try { return *boost::any_cast(attrs_.at(attr_name)); } catch (boost::bad_any_cast &) { @@ -76,7 +77,7 @@ class Pass { }; PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid type for attritube %s, expected: %s, actual: %s", attr_name, + "Invalid type for attritube %s, expected: %s, actual: %s.", attr_name, TypeToString(typeid(AttrType *)), TypeToString(attrs_.at(attr_name).type()))); } @@ -101,9 +102,10 @@ class Pass { template void Set(const std::string &attr_name, AttrType *attr) { if (default_pass_attrs_.count(attr_name) == 0) { - PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, - platform::errors::InvalidArgument( - "Attribute %s already set in the pass", attr_name)); + PADDLE_ENFORCE_EQ( + attrs_.count(attr_name), 0, + platform::errors::AlreadyExists( + "Attribute %s already set in the pass.", attr_name)); } else { VLOG(3) << "Setting the attribute " << attr_name << " for the pass " << type_; @@ -119,15 +121,16 @@ class Pass { // should delete the attribute. 
template void SetNotOwned(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", - attr_name); + PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, + platform::errors::AlreadyExists( + "Attribute %s already set in the pass.", attr_name)); attrs_[attr_name] = attr; } protected: virtual void ApplyImpl(Graph *graph) const { PADDLE_THROW(platform::errors::Unimplemented( - "The virtual Pass called is not implemented.")); + "The virtual pass called is not implemented.")); } // Some Pass must be placed before this Pass, and some @@ -198,8 +201,9 @@ class PassRegistry { } std::unique_ptr Get(const std::string &pass_type) const { - PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered", - pass_type); + PADDLE_ENFORCE_EQ(Has(pass_type), true, + platform::errors::InvalidArgument( + "Pass %s has not been registered.", pass_type)); return map_.at(pass_type)(); } @@ -213,8 +217,10 @@ class PassRegistry { template struct PassRegistrar : public Registrar { explicit PassRegistrar(const char *pass_type) { - PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), - "'%s' is registered more than once.", pass_type); + PADDLE_ENFORCE_EQ( + PassRegistry::Instance().Has(pass_type), false, + platform::errors::AlreadyExists( + "Pass '%s' is registered more than once.", pass_type)); PassRegistry::Instance().Insert( pass_type, [this, pass_type]() -> std::unique_ptr { std::unique_ptr pass(new PassType()); diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc index 8355764aa6c98..6457bd230c59c 100644 --- a/paddle/fluid/framework/ir/pass_builder.cc +++ b/paddle/fluid/framework/ir/pass_builder.cc @@ -28,13 +28,19 @@ std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { } void PassBuilder::RemovePass(size_t idx) { - PADDLE_ENFORCE(passes_.size() > idx); + PADDLE_ENFORCE_GT( + passes_.size(), idx, + platform::errors::InvalidArgument( + "Passes size is %d, %d is 
not a valid index.", passes_.size(), idx)); passes_.erase(passes_.begin() + idx); } std::shared_ptr PassBuilder::InsertPass(size_t idx, const std::string& pass_type) { - PADDLE_ENFORCE(passes_.size() >= idx); + PADDLE_ENFORCE_GE( + passes_.size(), idx, + platform::errors::InvalidArgument( + "Passes size is %d, %d is not a valid index.", passes_.size(), idx)); std::shared_ptr pass( ir::PassRegistry::Instance().Get(pass_type).release()); passes_.insert(passes_.begin() + idx, std::move(pass)); diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 14e94a2bc5c51..0c5286b3f77e1 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -119,7 +119,7 @@ TEST(PassTest, TestPassAttrCheck) { } catch (paddle::platform::EnforceNotMet& e) { exception = std::string(e.what()); } - ASSERT_TRUE(exception.find("shouldn't have cycle") != exception.npos); + ASSERT_TRUE(exception.find("shouldn't contain cycle") != exception.npos); pass = PassRegistry::Instance().Get("test_pass"); pass->Set("test_pass_attr", new int); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 1f1a54f140b0d..4506c162fa743 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -43,9 +43,11 @@ void DeleteQuant(ir::Graph* graph, Scope* scope, // ops linked from it auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - PADDLE_ENFORCE_EQ(subgraph.count(input_act_node), true, - platform::errors::NotFound( - "Input act node not found in Delete Quant fusion.")); + PADDLE_ENFORCE_EQ( + subgraph.count(input_act_node), true, + platform::errors::NotFound( + "Input act node(%s) not found in QuantDequantFuse pass.", + input_act_node->name())); Node* input_act = subgraph.at(input_act_node); Node* input_scale = 
subgraph.at(pattern.GetPDNode("input_scale_node")); Node* quant = subgraph.at(pattern.GetPDNode("quant_node")); @@ -58,7 +60,7 @@ void DeleteQuant(ir::Graph* graph, Scope* scope, std::string input_scale_var_name = quant->Op()->Input("InScale").front(); PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument( - "scope in DeleteQuantOpFuse pass should not be null.")); + "Scope in QuantDequantFuse pass should not be null.")); const LoDTensor& input_scale_tensor = scope->FindVar(input_scale_var_name)->Get(); PADDLE_ENFORCE_EQ( @@ -84,8 +86,8 @@ void DeleteQuant(ir::Graph* graph, Scope* scope, } else if (quantized_op_type == "mul") { op_desc->SetAttr("X_scale", scale_value); } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported quantized op type %s", quantized_op_type)); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported quantized op type %s.", quantized_op_type)); } op_desc->SetAttr("bit_length", bit_length); op_desc->RenameInput(output_act_name, input_act_name); @@ -119,9 +121,9 @@ void FuseDequant(ir::Graph* graph, Scope* scope, weight_name = "W"; input_name = "Input"; } else { - PADDLE_ENFORCE( + PADDLE_THROW(platform::errors::Unimplemented( "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for " - "now."); + "now.")); } const std::string pattern_name = "dequant_fuse"; GraphPatternDetector gpd; @@ -141,8 +143,9 @@ void FuseDequant(ir::Graph* graph, Scope* scope, Graph* g) { PADDLE_ENFORCE_EQ( subgraph.count(quantized_op_input), true, - platform::errors::NotFound( - "Quantized op input node not found in Delete Quant fusion.")); + platform::errors::NotFound("Quantized op input node(%s) did not find " + "in QuantDequantFuse pass.", + quantized_op_input->name())); Node* quantized_op_input_node = subgraph.at(quantized_op_input); Node* quantized_op_weight_node = subgraph.at(pattern.GetPDNode("quantized_op_weight")); @@ -165,7 +168,7 @@ void FuseDequant(ir::Graph* graph, Scope* scope, PADDLE_ENFORCE_EQ( 
scales_name.size(), 2, platform::errors::InvalidArgument( - "Scales size in channel-wise dequantize op should be 2, got %d", + "Scales size in channel-wise dequantize op should be 2, got %d.", scales_name.size())); const LoDTensor& channel_scale_tensor = scope->FindVar(scales_name[0])->Get(); @@ -193,9 +196,10 @@ void FuseDequant(ir::Graph* graph, Scope* scope, bool valid_scale_size = (weight_scale.size() == 1 || weight_scale.size() == static_cast(w_dims[0])); - PADDLE_ENFORCE_EQ(valid_scale_size, true, - platform::errors::InvalidArgument( - "TRT int8 quant: invalid scale size")); + PADDLE_ENFORCE_EQ( + valid_scale_size, true, + platform::errors::InvalidArgument( + "TRT int8 quant: invalid scale size(%d).", weight_scale.size())); float* quantized_weight_data = weight_tensor->mutable_data(platform::CPUPlace()); for (int j = 0; j < weight_tensor->numel(); j++) { diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index dddb2affbbad0..2396a7f3c4f84 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -278,11 +278,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto retrieve_node = [](const std::string& name, const GraphPatternDetector::subgraph_t& subgraph, const PDPattern& pat) -> Node* { - PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), - "pattern has no Node called %s", name.c_str()); + PADDLE_ENFORCE_GT(subgraph.count(pat.RetrieveNode(name)), 0, + platform::errors::NotFound( + "Pattern has no node called %s.", name.c_str())); Node* p = subgraph.at(pat.RetrieveNode(name)); - PADDLE_ENFORCE_NOT_NULL( - p, platform::errors::NotFound("subgraph has no node %s", name.c_str())); + PADDLE_ENFORCE_NOT_NULL(p, platform::errors::NotFound( + "Subgraph has no node %s.", name.c_str())); return p; }; @@ -365,7 +366,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, } void 
RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL(graph); + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); int fusion_count = 0; diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc index 81d9476d409d9..283fe3797e454 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc @@ -55,9 +55,15 @@ void TestMain(int num_fc) { VLOG(3) << DebugString(graph); // Delete (num_fc_nodes_before - 1) fc ops - PADDLE_ENFORCE_EQ(num_nodes_before - (num_fc_nodes_before - 1) + 1, - num_nodes_after); - PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1); + PADDLE_ENFORCE_EQ( + num_nodes_before - (num_fc_nodes_before - 1) + 1, num_nodes_after, + platform::errors::InvalidArgument( + "num_nodes_before = %d, num_fc_nodes_before = %d, num_nodes_after = " + "%d.", + num_nodes_before, num_fc_nodes_before, num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, + platform::errors::InvalidArgument( + "num_fused_nodes_after = %d.", num_fused_nodes_after)); } TEST(RepeatedFCReluFusePass, basic_3) { TestMain(3); } diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index bd826709b1d88..19ec2d818a3db 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -185,11 +185,13 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { auto* concat_out = BuildSeqExpandConcatPattern(pattern); BuildFCPattern(pattern, concat_out); -#define GET_NODE(id, pattern) \ - PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ - "pattern has no Node called %s", #id); \ - auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ - PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no 
node %s", #id); +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE_GT( \ + subgraph.count(pattern.RetrieveNode(#id)), 0, \ + platform::errors::NotFound("Pattern has no node called %s.", #id)); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL( \ + id, platform::errors::NotFound("Subgraph has no node %s.", #id)); int fuse_count{0}; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index ea376b371f592..1c220ee4d5718 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -139,11 +139,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto retrieve_node = [](const std::string& name, const GraphPatternDetector::subgraph_t& subgraph, const PDPattern& pat) -> Node* { - PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), - "pattern has no Node called %s", name.c_str()); + PADDLE_ENFORCE_GT(subgraph.count(pat.RetrieveNode(name)), 0, + platform::errors::NotFound( + "Pattern has no node called %s.", name.c_str())); Node* p = subgraph.at(pat.RetrieveNode(name)); - PADDLE_ENFORCE_NOT_NULL( - p, platform::errors::NotFound("subgraph has no node %s", name.c_str())); + PADDLE_ENFORCE_NOT_NULL(p, platform::errors::NotFound( + "Subgraph has no node %s.", name.c_str())); return p; }; diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index 92d2a6acbb9f7..d9a65e71592ff 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -47,7 +47,9 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { Graph* g) { GET_NODES; - PADDLE_ENFORCE(subgraph.count(x)); + PADDLE_ENFORCE_GT( + subgraph.count(x), 0, + platform::errors::NotFound("Detector did not find input X.")); auto* input_node = subgraph.at(x); auto reshape1_desc = 
reshape1_op->Op(); auto reshape2_desc = reshape2_op->Op(); diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc index 324b9c0b7da24..80f387c442760 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc @@ -59,12 +59,25 @@ TEST(SimplifyWithBasicOpsPass, dropout) { int num_scale_nodes_after = GetNumOpNodes(graph, "scale"); VLOG(3) << DebugString(graph); - PADDLE_ENFORCE_EQ(num_dropout_nodes_after, 0); + PADDLE_ENFORCE_EQ( + num_dropout_nodes_after, 0, + platform::errors::InvalidArgument("num_dropout_nodes_after = %d.", + num_dropout_nodes_after)); if (dropout_implementation == "downgrade_in_infer") { - PADDLE_ENFORCE_EQ(num_dropout_nodes_before, - num_scale_nodes_after - num_scale_nodes_before); + PADDLE_ENFORCE_EQ( + num_dropout_nodes_before, + num_scale_nodes_after - num_scale_nodes_before, + platform::errors::InvalidArgument( + "num_dropout_nodes_before = %d, num_scale_nodes_after = %d, " + "num_scale_nodes_before = %d.", + num_dropout_nodes_before, num_scale_nodes_after, + num_scale_nodes_before)); } else { - PADDLE_ENFORCE_EQ(num_scale_nodes_after - num_scale_nodes_before, 0); + PADDLE_ENFORCE_EQ( + num_scale_nodes_after - num_scale_nodes_before, 0, + platform::errors::InvalidArgument( + "num_scale_nodes_after = %d, num_scale_nodes_before = %d.", + num_scale_nodes_after, num_scale_nodes_before)); } } } diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 6d908b4362b80..035b198bdcc51 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -300,10 +300,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { auto retrieve_node = [](const std::string& name, const GraphPatternDetector::subgraph_t& subgraph, const 
PDPattern& pat) -> Node* { - PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), - "pattern has no Node called %s", name.c_str()); + PADDLE_ENFORCE_GT(subgraph.count(pat.RetrieveNode(name)), 0, + platform::errors::NotFound( + "Pattern has no node called %s.", name.c_str())); Node* p = subgraph.at(pat.RetrieveNode(name)); - PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str()); + PADDLE_ENFORCE_NOT_NULL(p, platform::errors::NotFound( + "Subgraph has no node %s.", name.c_str())); return p; }; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 90ffaada055a9..9a0a5f07a7080 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -51,15 +51,25 @@ void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) { std::vector nodes; for (int i = 0; i < times; i++) { - PADDLE_ENFORCE( - subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); - PADDLE_ENFORCE( - subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); - PADDLE_ENFORCE( - subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); - PADDLE_ENFORCE( - subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); - PADDLE_ENFORCE(subgraph.at(input_nodes[i])); + PADDLE_ENFORCE_NOT_NULL( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), + platform::errors::NotFound("Can not find transpose%d in subgraph.", + i)); + PADDLE_ENFORCE_NOT_NULL( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))), + platform::errors::NotFound( + "Can not find transpose_out%d in subgraph.", i)); + PADDLE_ENFORCE_NOT_NULL( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))), + platform::errors::NotFound("Can not find flatten%d in subgraph.", i)); + PADDLE_ENFORCE_NOT_NULL( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))), + 
platform::errors::NotFound("Can not find flatten_out%d in subgraph.", + i)); + PADDLE_ENFORCE_NOT_NULL( + subgraph.at(input_nodes[i]), + platform::errors::NotFound("Can not find %s in subgraph.", + input_nodes[i]->name())); nodes.push_back(subgraph.at(input_nodes[i])); nodes.push_back( diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index c62835e51be0d..21d3454467603 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -117,7 +117,7 @@ TEST(OpRegistry, IllegalAttr) { paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::platform::EnforceNotMet& err) { caught = true; - std::string msg = "larger_than check fail"; + std::string msg = "OutOfRangeError"; std::string err_msg = err.what(); ASSERT_TRUE(err_msg.find(msg) != std::string::npos); } @@ -151,7 +151,7 @@ TEST(OpRegistry, CustomChecker) { paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::platform::EnforceNotMet& err) { caught = true; - std::string msg = "Attribute 'test_attr' is required!"; + std::string msg = "InvalidArgumentError"; std::string err_msg = err.what(); ASSERT_TRUE(err_msg.find(msg) != std::string::npos); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index df1e0fb6d5b48..544c014eaf98a 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -108,8 +108,15 @@ const DDim& Tensor::dims() const { return dims_; } int64_t Tensor::numel() const { return product(dims_); } void Tensor::ResetHolder(std::shared_ptr holder) { + PADDLE_ENFORCE_EQ( + offset_, 0, + platform::errors::Fatal( + "Only the offset is supported to zero when the holder is reset.")); if (holder_) { - PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size()); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()) + offset_, holder->size(), + paddle::platform::errors::InvalidArgument( + "The size of Holder is not enough to store the 
Tensor.")); } holder_ = holder; } diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 60bc88ca7237c..de1246883f101 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -33,8 +33,10 @@ namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) { +void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy, + bool retain_graph) { backward_strategy_ = strategy; + retain_graph_ = retain_graph; init_node_ = var->GradVarBase()->GradNode(); var->GradVarBase()->ClearGradNode(); @@ -205,7 +207,9 @@ void BasicEngine::Execute() { continue; } - var = std::make_shared(var->Name()); + auto tmp_var = std::make_shared(var->Name()); + tmp_var->SetType(var->Type()); + var = tmp_var; need_accu_var_list_.emplace_back(iter->second.get(), var); } } @@ -224,7 +228,9 @@ void BasicEngine::Execute() { need_accu_var_list_.clear(); VLOG(3) << "Remove op after op " << cur_op.Type() << " runs"; - cur_op.ClearBackwardTrace(); + if (!retain_graph_) { + cur_op.ClearBackwardTrace(); + } } // Step 3: Collect ready ops diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 2d517bb43d39f..4d25d81235098 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,8 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, const detail::BackwardStrategy& strategy); + void Init(VarBase* var, const detail::BackwardStrategy& strategy, + bool retain_graph = false); void Execute() override; @@ -51,6 +52,7 @@ class BasicEngine : public Engine { accumulators_; std::vector>> need_accu_var_list_; + bool retain_graph_; }; } // namespace imperative diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 37ea3e5b40a65..9dc96fdfe8622 100644 --- 
a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -41,11 +41,16 @@ add_subdirectory(api) # Create static inference library if needed # All static libs in inference/api -set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array - analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) -create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) +set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor + zero_copy_tensor reset_tensor_array + analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) +if(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) +else() + create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) +endif() -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2fc7f81bf8a59..27bae7a71ea19 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -200,6 +200,10 @@ struct Argument { DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode, AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool); + + DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); + DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4a79a3cf30503..cd8d86d729384 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -146,6 +146,10 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("predictor_id", new int(argument->predictor_id())); pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_gpu", new bool(argument->use_gpu())); + pass->Set("zero_copy", new bool(argument->lite_zero_copy())); + pass->Set("use_xpu", new bool(argument->use_xpu())); + pass->Set("xpu_l3_workspace_size", + new int(argument->xpu_l3_workspace_size())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 91d0aec3f41fd..6b16a481ddedb 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -242,16 +242,33 @@ void LiteSubgraphPass::SetUpEngine( bool use_gpu = Get("use_gpu"); bool enable_int8 = Get("enable_int8"); - lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86); + bool use_xpu = Get("use_xpu"); + int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); + + lite_api::TargetType target_type; + if (use_gpu) { + target_type = TARGET(kCUDA); + } else if (use_xpu) { + target_type = TARGET(kXPU); + } else { + target_type = TARGET(kX86); + } + paddle::lite_api::PrecisionType precision_type = - enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64); + enable_int8 ? 
PRECISION(kInt8) : PRECISION(kFloat); + serialize_params(&config.param, scope, repetitive_params); config.model = program->Proto()->SerializeAsString(); config.valid_places = { + // Notice: The ordering here determines the device where the + // input tensor of the Lite engine is located, and then affects + // whether tensor sharing is feasible. paddle::lite::Place({target_type, precision_type}), + paddle::lite::Place({target_type, PRECISION(kInt64)}), paddle::lite::Place({target_type, PRECISION(kFloat)}), paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}), }; + config.xpu_l3_workspace_size = xpu_l3_workspace_size; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); @@ -283,6 +300,7 @@ void LiteSubgraphPass::BuildOperator( op_desc->SetAttr("engine_key", unique_key); op_desc->SetAttr("enable_int8", Get("enable_int8")); op_desc->SetAttr("use_gpu", Get("use_gpu")); + op_desc->SetAttr("zero_copy", Get("zero_copy")); } void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 994f7c9535263..61886c225e654 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -88,6 +88,12 @@ void AnalysisConfig::DisableFCPadding() { Update(); } +void AnalysisConfig::EnableXpu(int l3_workspace_size) { + use_xpu_ = true; + xpu_l3_workspace_size_ = l3_workspace_size; + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -132,6 +138,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(lite_precision_mode_); CP_MEMBER(lite_passes_filter_); CP_MEMBER(lite_ops_filter_); + CP_MEMBER(lite_zero_copy_); + + CP_MEMBER(use_xpu_); + CP_MEMBER(xpu_l3_workspace_size_); // profile related. 
CP_MEMBER(with_profile_); @@ -344,6 +354,22 @@ void AnalysisConfig::Update() { } } + if (use_xpu_) { +#ifndef PADDLE_WITH_XPU + PADDLE_THROW(platform::errors::Unavailable( + "You tried to use an XPU device, but Paddle was not compiled " + "with XPU-runtime.")); +#endif + if (!use_lite_) { + LOG(WARNING) << "Because XPU currently only works in Paddle-Lite " + "subgraph mode, please make sure you have enabled it."; + } + PADDLE_ENFORCE_EQ(use_gpu_, false, + platform::errors::Unavailable( + "Currently, XPU and GPU cannot be enabled in the " + "same analysis configuration.")); + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -387,6 +413,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << cpu_math_library_num_threads_; ss << use_lite_; + ss << use_xpu_; + ss << xpu_l3_workspace_size_; ss << thread_local_stream_; @@ -464,13 +492,14 @@ void AnalysisConfig::DisableGlogInfo() { } void AnalysisConfig::EnableLiteEngine( - AnalysisConfig::Precision precision_mode, + AnalysisConfig::Precision precision_mode, bool zero_copy, const std::vector &passes_filter, const std::vector &ops_filter) { use_lite_ = true; lite_precision_mode_ = precision_mode; lite_passes_filter_ = passes_filter; lite_ops_filter_ = ops_filter; + lite_zero_copy_ = zero_copy; Update(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 195053814e6a0..a8c8058c6b714 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -465,6 +465,9 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLitePrecisionMode(config_.lite_precision_mode_); argument_.SetLitePassesFilter(config_.lite_passes_filter_); argument_.SetLiteOpsFilter(config_.lite_ops_filter_); + argument_.SetLiteZeroCopy(config_.lite_zero_copy_); + argument_.SetUseXpu(config_.use_xpu_); + argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); LOG(INFO) << "Lite subgraph engine is enabled"; } 
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index bfa273d4468db..d8d9e2187815d 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -72,7 +72,7 @@ if [ $(echo `uname` | grep "Win") != "" ]; then -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=ON + -DWITH_STATIC_LIB=OFF msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/simple_on_word2vec.exe \ --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ @@ -88,7 +88,7 @@ if [ $(echo `uname` | grep "Win") != "" ]; then -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=ON + -DWITH_STATIC_LIB=OFF msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for vis_demo_name in $vis_demo_list; do Release/vis_demo.exe \ diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ec7b08b306707..6a31ff281c68e 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -176,6 +176,8 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); + + void EnableXpu(int l3_workspace_size = 0xfffc00); /// /// \brief A boolean state telling whether the GPU is turned on. /// @@ -319,6 +321,7 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableLiteEngine( AnalysisConfig::Precision precision_mode = Precision::kFloat32, + bool zero_copy = false, const std::vector& passes_filter = {}, const std::vector& ops_filter = {}); @@ -579,8 +582,11 @@ struct PD_INFER_DECL AnalysisConfig { std::vector lite_passes_filter_; std::vector lite_ops_filter_; Precision lite_precision_mode_; + bool lite_zero_copy_; bool thread_local_stream_{false}; + bool use_xpu_{false}; + int xpu_l3_workspace_size_; // mkldnn related. 
int mkldnn_cache_capacity_{0}; diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index 1d957048148b5..fd513b59588f8 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -1,5 +1,9 @@ +if(XPU_SDK_ROOT) + set(XPU_DEPS xpuapi xpurt) +endif() + cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash) -cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto) -cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost) +cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS}) +cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context) cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis) cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index fb3b6e460d5bb..8e88c94493952 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -16,8 +16,11 @@ #define LITE_WITH_CUDA 1 #endif -#include "paddle/fluid/inference/lite/engine.h" +#ifdef PADDLE_WITH_XPU +#define LITE_WITH_XPU 1 +#endif +#include "paddle/fluid/inference/lite/engine.h" #include "lite/api/paddle_use_passes.h" namespace paddle { @@ -39,10 +42,17 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { paddle::lite::Predictor* EngineManager::Create(const std::string& name, const EngineConfig& cfg) { - auto* p = new paddle::lite::Predictor(); + if (cfg.valid_places.front().target == TARGET(kCUDA)) { #ifdef PADDLE_WITH_CUDA - paddle::lite::Env::Init(); + paddle::lite::Env::Init(); #endif + } else if (cfg.valid_places.front().target == TARGET(kXPU)) { +#ifdef 
PADDLE_WITH_XPU + paddle::lite::TargetWrapper::workspace_l3_size_per_thread = + cfg.xpu_l3_workspace_size; +#endif + } + auto* p = new paddle::lite::Predictor(); p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, cfg.model_type, cfg.model_from_memory); engines_[name].reset(p); diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5f11c51952bd3..345eb682e9fe8 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -26,6 +26,7 @@ #include "lite/api/paddle_place.h" #include "lite/core/context.h" #include "lite/core/device_info.h" +#include "lite/core/memory.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" #pragma GCC diagnostic pop @@ -42,6 +43,7 @@ struct EngineConfig { std::vector neglected_passes; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; bool model_from_memory{true}; + size_t xpu_l3_workspace_size; }; class EngineManager { diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 59087c6fec203..d79a041ccf8a1 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -14,8 +14,10 @@ #include "paddle/fluid/inference/lite/tensor_utils.h" #include +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { namespace inference { @@ -46,6 +48,9 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) { return platform::CPUPlace(); case TargetType::kCUDA: return platform::CUDAPlace(id); + case TargetType::kXPU: + LOG(ERROR) << "No corresponding device for XPU yet."; + return platform::Place(); default: PADDLE_THROW( platform::errors::Unavailable("Unsupported target type. 
Now only " @@ -191,6 +196,31 @@ void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); } +template <> +void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { + const size_t bytes = + static_cast(src->numel()) * framework::SizeOfType(src->type()); + auto buf = std::make_shared(paddle::lite::Buffer( + src->data(), GetLiteTargetType(src->place()), src->memory_size())); + dst->Resize(framework::vectorize(src->dims())); + dst->set_precision(GetLitePrecisionType(src->type())); + SetLoD(dst->mutable_lod(), src->lod()); + dst->ResetBuffer(buf, bytes); +} + +template <> +void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { + constexpr framework::proto::VarType::Type dtype = + framework::proto::VarType_Type_FP32; + void* src_raw_data = src->raw_data(); + std::shared_ptr holder( + new memory::allocation::Allocation(src_raw_data, src->memory_size(), + GetNativePlace(src->target()))); + dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); + SetLoD(dst->mutable_lod(), src->lod()); + dst->ResetHolderWithType(holder, dtype); +} + } // namespace utils } // namespace lite } // namespace inference diff --git a/paddle/fluid/inference/lite/tensor_utils.h b/paddle/fluid/inference/lite/tensor_utils.h index 21c5e794d4195..1b2923bc28033 100644 --- a/paddle/fluid/inference/lite/tensor_utils.h +++ b/paddle/fluid/inference/lite/tensor_utils.h @@ -26,6 +26,21 @@ template void TensorCopyAsync(DstTensor* dst, const SrcTensor& src, const platform::DeviceContext& ctx); +template +void TensorDataShare(DstTensor* dst, SrcTensor* src); + +template +void TensorCopy(DstTensor* dst, SrcTensor* src, + const platform::DeviceContext& ctx, bool shared = true) { + if (shared) { + VLOG(3) << "TensorDataShare is running"; + TensorDataShare(dst, src); + } else { + VLOG(3) << "TensorCopyAsync is running"; + TensorCopyAsync(dst, *src, ctx); + } +} + } // namespace 
utils } // namespace lite } // namespace inference diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index 48ae1bd71d8a4..eef7bfb68fe06 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -30,7 +30,7 @@ TEST(LiteEngineOp, GetNativePlace) { platform::Place GetNativePlace(const TargetType& type, int id = 0); EXPECT_TRUE(platform::is_cpu_place(GetNativePlace(TargetType::kHost))); EXPECT_TRUE(platform::is_gpu_place(GetNativePlace(TargetType::kCUDA))); - ASSERT_DEATH(GetNativePlace(TargetType::kUnk), ""); + EXPECT_ANY_THROW(GetNativePlace(TargetType::kUnk)); } TEST(LiteEngineOp, GetLiteTargetType) { @@ -48,8 +48,8 @@ TEST(LiteEngineOp, GetLitePrecisionType) { PrecisionType::kInt8); ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT32), PrecisionType::kInt32); - ASSERT_DEATH( - GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS), ""); + EXPECT_ANY_THROW( + GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS)); } TEST(LiteEngineOp, GetNativePrecisionType) { @@ -62,7 +62,7 @@ TEST(LiteEngineOp, GetNativePrecisionType) { framework::proto::VarType_Type_INT8); ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt32), framework::proto::VarType_Type_INT32); - ASSERT_DEATH(GetNativePrecisionType(PrecisionType::kUnk), ""); + EXPECT_ANY_THROW(GetNativePrecisionType(PrecisionType::kUnk)); } TEST(LiteEngineOp, GetNativeLayoutType) { @@ -70,14 +70,14 @@ TEST(LiteEngineOp, GetNativeLayoutType) { framework::DataLayout GetNativeLayoutType(const DataLayoutType& type); ASSERT_EQ(GetNativeLayoutType(DataLayoutType::kNCHW), framework::DataLayout::kNCHW); - ASSERT_DEATH(GetNativeLayoutType(DataLayoutType::kNHWC), ""); + EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); } void test_tensor_copy(const platform::DeviceContext& ctx) { // Create LoDTensor. 
std::vector vector({1, 2, 3, 4}); framework::LoDTensor lod_tensor; - framework::TensorFromVector(vector, &lod_tensor); + framework::TensorFromVector(vector, ctx, &lod_tensor); framework::LoD lod({{0, 2, 4}}); lod_tensor.Resize({4, 1}); lod_tensor.set_lod(lod); @@ -94,7 +94,26 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { } #endif std::vector result; - TensorToVector(lod_tensor_n, &result); + TensorToVector(lod_tensor_n, ctx, &result); + ASSERT_EQ(result, vector); + ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod()); +} + +void test_tensor_share(const platform::DeviceContext& ctx) { + std::vector vector({1, 2, 3, 4}); + framework::LoDTensor lod_tensor; + framework::TensorFromVector(vector, ctx, &lod_tensor); + framework::LoD lod({{0, 2, 4}}); + lod_tensor.Resize({4, 1}); + lod_tensor.set_lod(lod); + // Create lite::Tensor and share. + paddle::lite::Tensor lite_tensor; + TensorDataShare(&lite_tensor, &lod_tensor); + // Copy to LoDTensor. + framework::LoDTensor lod_tensor_n; + TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + std::vector result; + TensorToVector(lod_tensor_n, ctx, &result); ASSERT_EQ(result, vector); ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod()); } @@ -110,6 +129,17 @@ TEST(LiteEngineOp, TensorCopyAsync) { #endif } +TEST(LiteEngineOp, TensorShare) { + auto* ctx_cpu = + platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); + test_tensor_share(*ctx_cpu); +#ifdef PADDLE_WITH_CUDA + auto* ctx_gpu = + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)); + test_tensor_share(*ctx_gpu); +#endif +} + } // namespace utils } // namespace lite } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 5f65229ecd52a..65755b7b15ad5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -56,8 +56,6 @@ void SetConfig(AnalysisConfig *cfg) { 
cfg->DisableGpu(); cfg->SwitchIrDebug(); cfg->SwitchSpecifyInputNames(false); - // TODO(TJ): fix fusion gru - cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 0816218a0d18a..bd1908ac65509 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -27,26 +27,23 @@ else () set(AllocatorFacadeDeps) endif() -# TODO: Fix this unittest failed on Windows -if(NOT WIN32) - if (WITH_GPU) - nv_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu - DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) - else() - cc_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - DEPS best_fit_allocator - locked_allocator - cpu_allocator) - endif() -endif(NOT WIN32) +if (WITH_GPU) + nv_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + best_fit_allocator_test.cu + DEPS best_fit_allocator + locked_allocator + cpu_allocator + cuda_allocator + device_context + memcpy) +else() + cc_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + DEPS best_fit_allocator + locked_allocator + cpu_allocator) +endif() list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator) diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc index fa7662d2f81b1..d20a6fc0e061b 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -13,11 +13,13 @@ // limitations under the License. 
#include "paddle/fluid/memory/allocation/best_fit_allocator.h" + #include #include #include // NOLINT #include #include + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h" @@ -33,7 +35,10 @@ class StubAllocation : public Allocation { }; TEST(BestFitAllocator, test_allocation) { - StubAllocation stub(4UL * 1024 * 1024 * 1024); + // NOTE(zhiqiu): On windows with msvc compiler, unsigned long (UL) is 32bits, + // so 4UL * 1024 * 1024 * 1024 becomes 0. + // We need to use 4ULL (unsigned long long) here. + StubAllocation stub(4ULL * 1024 * 1024 * 1024); BestFitAllocator allocator(&stub); { auto allocation = allocator.Allocate(64); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index c5b9d88433af9..0fbbf405f0bf1 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,6 +17,9 @@ limitations under the License. */ #ifdef _WIN32 #include +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif #include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 204f854a380ab..b9a92c2207d8e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -250,6 +250,20 @@ Sine Activation Operator. )DOC"; +UNUSED constexpr char SinhDoc[] = R"DOC( +Sinh Activation Operator. + +$$out = sinh(x)$$ + +)DOC"; + +UNUSED constexpr char CoshDoc[] = R"DOC( +Cosh Activation Operator. + +$$out = cosh(x)$$ + +)DOC"; + UNUSED constexpr char RoundDoc[] = R"DOC( The OP rounds the values in the input to the nearest integer value. 
@@ -642,6 +656,8 @@ REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc); REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc); +REGISTER_ACTIVATION_OP_MAKER(Sinh, SinhDoc); +REGISTER_ACTIVATION_OP_MAKER(Cosh, CoshDoc); REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b3784ed074409..3aac7ae8a5e8a 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -584,6 +584,72 @@ struct SinFunctor : public BaseActivationFunctor { } }; +template +struct Sinh { + HOSTDEVICE T operator()(const T& val) const { return sinh(val); } +}; + +template <> +struct Sinh { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(sinhf(static_cast(val))); + } +}; + +template +struct Cosh { + HOSTDEVICE T operator()(const T& val) const { return cosh(val); } +}; + +template <> +struct Cosh { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(coshf(static_cast(val))); + } +}; + +// sinh(x) = sinh(x) +template +struct SinhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sinh()); + } +}; + +// cosh(x) = cosh(x) +template +struct CoshFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosh()); + } +}; + +// sinh'(x) = cosh(x) +template +struct SinhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// cosh'(x) = sinh(x) +template +struct 
CoshGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Sinh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct Acos { HOSTDEVICE T operator()(const T& val) const { return acos(val); } @@ -1752,6 +1818,8 @@ class PowGradKernel __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ __macro(sin, Sin, SinFunctor, SinGradFunctor); \ __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ + __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ + __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, Log, LogFunctor, LogGradFunctor); \ diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc new file mode 100644 index 0000000000000..b742b4c0deea8 --- /dev/null +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/bilateral_slice_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +class BilateralSliceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BilateralSlice"); + OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "BilateralSlice"); + OP_INOUT_CHECK(ctx->HasInput("Guide"), "Input", "Guide", "BilateralSlice"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Output", "BilateralSlice"); + + auto dim_x = ctx->GetInputDim("X"); // NCHW format + PADDLE_ENFORCE_EQ( + dim_x.size(), 4, + platform::errors::Unimplemented( + "Input(X) dimension must be 4, but got dimension = %d .", + dim_x.size())); + + auto input_dims = ctx->GetInputDim("X"); + auto grid_dims = ctx->GetInputDim("Grid"); + auto guide_dims = ctx->GetInputDim("Guide"); + bool has_offset = ctx->Attrs().Get("has_offset"); + int64_t h = guide_dims[1]; + int64_t w = guide_dims[2]; + int64_t bs = grid_dims[0]; + int64_t coeffs_chans = grid_dims[1]; + int64_t input_chans = input_dims[1]; + + int64_t output_chans; + if (has_offset) { + PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0, + platform::errors::InvalidArgument( + "Slicing with affine offset, coefficients grid " + "should have n_out*(n_in+1) channels, but got %d", + coeffs_chans)); + output_chans = coeffs_chans / (input_chans + 1); + } else { + PADDLE_ENFORCE_EQ((coeffs_chans % input_chans), 0, + platform::errors::InvalidArgument( + "Slicing without affine offset, coefficients grid " + "should have n_out*n_in channels, but got %d .", + coeffs_chans)); + output_chans = coeffs_chans / input_chans; + } + + std::vector output_dims; + output_dims.push_back(bs); + 
output_dims.push_back(output_chans); + output_dims.push_back(h); + output_dims.push_back(w); + + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class BilateralSliceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of bilateral_slice operator, " + "This is a 4-D tensor with shape of [N, C, H, W]"); + AddInput("Grid", + "This is a 5-D tensor. " + "It should be [N, C, D, H, W]."); + AddInput("Guide", + "This is a 3-D tensor " + "It should be [N, H, W]."); + AddOutput("Out", + "The output tensor of bilateral slice operator, " + "This is a tensor in same rank with Input(X)."); + AddAttr("has_offset", "an optional bool. Defaults to False. 
") + .SetDefault(false); + AddComment(R"DOC( + This operator enhance input X according guide and grid + For details of bilateral slice, please refer to paper: + https://groups.csail.mit.edu/graphics/hdrnet/ + )DOC"); + } +}; + +class BilateralSliceOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BilateralSliceOpGrad"); + OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", + "BilateralSliceOpGrad"); + OP_INOUT_CHECK(ctx->HasInput("Guide"), "Input", "Guide", + "BilateralSliceOpGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", "Out", + "BilateralSliceOpGrad"); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_grid = ctx->GetInputDim("Grid"); + auto dim_guide = ctx->GetInputDim("Guide"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + if (ctx->HasOutput(framework::GradVarName("Grid"))) { + ctx->SetOutputDim(framework::GradVarName("Grid"), dim_grid); + } + if (ctx->HasOutput(framework::GradVarName("Guide"))) { + ctx->SetOutputDim(framework::GradVarName("Guide"), dim_guide); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class BilateralSliceGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Grid", this->Input("Grid")); + op->SetInput("Guide", this->Input("Guide")); + + op->SetInput(framework::GradVarName("Out"), 
this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Grid"), this->InputGrad("Grid")); + op->SetOutput(framework::GradVarName("Guide"), this->InputGrad("Guide")); + op->SetAttrMap(this->Attrs()); + } +}; + +template +class BilateralSliceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::Unimplemented( + "BilateralSlice only supports GPU now.")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bilateral_slice, ops::BilateralSliceOp, + ops::BilateralSliceOpMaker, + ops::BilateralSliceGradMaker, + ops::BilateralSliceGradMaker); +REGISTER_OPERATOR(bilateral_slice_grad, ops::BilateralSliceOpGrad); +REGISTER_OP_CPU_KERNEL(bilateral_slice, ops::BilateralSliceKernel, + ops::BilateralSliceKernel); diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu new file mode 100644 index 0000000000000..e46950f61887d --- /dev/null +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -0,0 +1,506 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/operators/bilateral_slice_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +template +__device__ T DiffAbs(T x) { + T eps = 1e-8; + return sqrt(x * x + eps); +} + +template +__device__ T DdiffAbs(T x) { + T eps = 1e-8; + return x / sqrt(x * x + eps); +} + +template +__device__ T WeightZ(T x) { + T abx = DiffAbs(x); + return max(1.0f - abx, 0.0f); +} + +template +__device__ T DweightZ(T x) { + T abx = DiffAbs(x); + if (abx > 1.0f) { + return 0.0f; + } else { + return DdiffAbs(x); + } +} + +template +__global__ void BilateralSliceCudaForwardKernel( + T* output, const T* bilateral_grid, const T* guide, const T* input, + GridSizes gsz, bool has_offset, int total_count, int output_chans) { + int h = gsz.h; + int w = gsz.w; + int gd = gsz.gd; + int gh = gsz.gh; + int gw = gsz.gw; + int input_chans = gsz.input_chans; + int coeff_stride = input_chans; + int grid_chans = input_chans * output_chans; + + if (has_offset) { + grid_chans += output_chans; + coeff_stride += 1; + } + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_count; + idx += blockDim.x * gridDim.x) { + int x = idx % w; + int y = (idx / w) % h; + int out_c = (idx / (h * w)) % output_chans; + int b = (idx / (output_chans * w * h)); + + T gx = (x + 0.5f) * gw / (1.0f * w); + T gy = (y + 0.5f) * gh / (1.0f * h); + T gz = guide[x + w * (y + h * b)] * gd; + + int fx = static_cast(floor(gx - 0.5f)); + int fy = static_cast(floor(gy - 0.5f)); + int fz = static_cast(floor(gz - 0.5f)); + + int sy = gw; + int sz = gw * gh; + int sc = gd * gw * gh; + int sb = grid_chans * gd * gw * gh; + + T value = 0.0f; + for (int in_c = 0; in_c < coeff_stride; ++in_c) { + T coeff_sample = 0.0f; + + for (int xx = fx; xx < fx + 2; ++xx) { + int x_ = max(min(xx, gw - 1), 0); + T wx = max(1.0f - 
abs(xx + 0.5 - gx), 0.0f); + + for (int yy = fy; yy < fy + 2; ++yy) { + int y_ = max(min(yy, gh - 1), 0); + T wy = max(1.0f - abs(yy + 0.5 - gy), 0.0f); + + for (int zz = fz; zz < fz + 2; ++zz) { + int z_ = max(min(zz, gd - 1), 0); + T wz = WeightZ(zz + 0.5 - gz); + int c_ = coeff_stride * out_c + in_c; + int grid_idx = x_ + sy * y_ + sz * z_ + sc * c_ + sb * b; + + coeff_sample += bilateral_grid[grid_idx] * wx * wy * wz; + } + } + } + if (in_c < input_chans) { + int input_idx = x + w * (y + h * (in_c + input_chans * b)); + value += coeff_sample * input[input_idx]; + } else { + value += coeff_sample; + } + } + + output[idx] = value; + } +} + +template +class BilateralSliceOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* grid = ctx.Input("Grid"); + auto* guide = ctx.Input("Guide"); + auto* output = ctx.Output("Out"); + + auto* output_data = output->mutable_data(ctx.GetPlace()); + auto* grid_data = grid->data(); + auto* guide_data = guide->data(); + auto* input_data = input->data(); + + bool has_offset = ctx.Attr("has_offset"); + auto input_dims = input->dims(); + auto output_dims = output->dims(); + auto grid_dims = grid->dims(); + + int batch_size = input_dims[0]; + int h = input_dims[2]; + int w = input_dims[3]; + int input_chans = input_dims[1]; + int coeff_stride = input_chans; + int grid_chans = input_chans * output_dims[1]; + + int64_t coeffs_chans = grid_dims[1]; + int64_t gd = grid_dims[2]; + int64_t gh = grid_dims[3]; + int64_t gw = grid_dims[4]; + + GridSizes grid_sizes; + grid_sizes.h = h; + grid_sizes.w = w; + grid_sizes.bs = batch_size; + grid_sizes.coeffs_chans = coeffs_chans; + grid_sizes.gd = gd; + grid_sizes.gh = gh; + grid_sizes.gw = gw; + grid_sizes.input_chans = input_chans; + + int total_count = batch_size * h * w * output_dims[1]; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(total_count, ctx); + + 
BilateralSliceCudaForwardKernel<<>>( + output_data, grid_data, guide_data, input_data, grid_sizes, has_offset, + total_count, output_dims[1]); + } +}; + +template +__global__ void BilateralSliceCudaGridGradKernel( + T* out_grid_grad, const T* upstream_grad, const T* guide, const T* input, + GridSizes gsz, bool has_offset, int grid_count, int output_chans) { + int h = gsz.h; + int w = gsz.w; + int gd = gsz.gd; + int gh = gsz.gh; + int gw = gsz.gw; + int input_chans = gsz.input_chans; + int grid_chans = input_chans * output_chans; + int coeff_stride = input_chans; + + if (has_offset) { + grid_chans += output_chans; + coeff_stride += 1; + } + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < grid_count; + idx += blockDim.x * gridDim.x) { + int gx = idx % gw; + int gy = (idx / gw) % gh; + int gz = (idx / (gh * gw)) % gd; + int c = (idx / (gd * gh * gw)) % grid_chans; + int b = (idx / (grid_chans * gd * gw * gh)); + + T scale_w = w * 1.0 / gw; + T scale_h = h * 1.0 / gh; + + int left_x = static_cast(floor(scale_w * (gx + 0.5 - 1))); + int right_x = static_cast(ceil(scale_w * (gx + 0.5 + 1))); + int left_y = static_cast(floor(scale_h * (gy + 0.5 - 1))); + int right_y = static_cast(ceil(scale_h * (gy + 0.5 + 1))); + + int sy = w; + int sc = w * h; + int sb = output_chans * w * h; + + int isy = w; + int isc = h * w; + int isb = input_chans * h * w; + + int out_c = c / coeff_stride; + int in_c = c % coeff_stride; + + T value = 0.0f; + for (int x = left_x; x < right_x; ++x) { + int x_ = x; + + if (x_ < 0) { + x_ = -x_ - 1; + } + if (x_ >= w) { + x_ = 2 * w - 1 - x_; + } + + T gx2 = (x + 0.5f) / scale_w; + T wx = max(1.0f - abs(gx + 0.5 - gx2), 0.0f); + + for (int y = left_y; y < right_y; ++y) { + int y_ = y; + + if (y_ < 0) { + y_ = -y_ - 1; + } + if (y_ >= h) { + y_ = 2 * h - 1 - y_; + } + + T gy2 = (y + 0.5f) / scale_h; + T wy = max(1.0f - abs(gy + 0.5 - gy2), 0.0f); + + int guide_idx = x_ + w * y_ + h * w * b; + T gz2 = guide[guide_idx] * gd; + T wz = 
WeightZ(gz + 0.5f - gz2); + if (((gz == 0) && (gz2 < 0.5f)) || + ((gz == (gd - 1)) && (gz2 > (gd - 0.5f)))) { + wz = 1.0f; + } + + int back_idx = x_ + sy * y_ + sc * out_c + sb * b; + if (in_c < input_chans) { + int input_idx = x_ + isy * y_ + isc * in_c + isb * b; + value += wz * wx * wy * upstream_grad[back_idx] * input[input_idx]; + } else { + value += wz * wx * wy * upstream_grad[back_idx]; + } + } + } + out_grid_grad[idx] = value; + } +} + +template +__global__ void BilateralSliceCudaGuideGradKernel( + T* out_guide_grad, const T* upstream_grad, const T* bilateral_grid, + const T* guide, const T* input, GridSizes gsz, bool has_offset, + int guide_count, int output_chans) { + int h = gsz.h; + int w = gsz.w; + int gd = gsz.gd; + int gh = gsz.gh; + int gw = gsz.gw; + int input_chans = gsz.input_chans; + int grid_chans = input_chans * output_chans; + int coeff_stride = input_chans; + + if (has_offset) { + grid_chans += output_chans; + coeff_stride += 1; + } + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < guide_count; + idx += blockDim.x * gridDim.x) { + int x = idx % w; + int y = (idx / w) % h; + int b = (idx / (w * h)); + + T gx = (x + 0.5f) * gw / (1.0f * w); + T gy = (y + 0.5f) * gh / (1.0f * h); + T gz = guide[x + w * (y + h * b)] * gd; + + int fx = static_cast(floor(gx - 0.5f)); + int fy = static_cast(floor(gy - 0.5f)); + int fz = static_cast(floor(gz - 0.5f)); + + int sy = gw; + int sz = gh * gw; + int sc = gd * gh * gw; + int sb = grid_chans * gd * gw * gh; + + T out_sum = 0.0f; + for (int out_c = 0; out_c < output_chans; ++out_c) { + T in_sum = 0.0f; + for (int in_c = 0; in_c < coeff_stride; ++in_c) { + T grid_sum = 0.0f; + for (int xx = fx; xx < fx + 2; ++xx) { + int x_ = max(min(xx, gw - 1), 0); + T wx = max(1.0f - abs(xx + 0.5 - gx), 0.0f); + + for (int yy = fy; yy < fy + 2; ++yy) { + int y_ = max(min(yy, gh - 1), 0); + T wy = max(1.0f - abs(yy + 0.5 - gy), 0.0f); + + for (int zz = fz; zz < fz + 2; ++zz) { + int z_ = max(min(zz, gd - 1), 
0); + T dwz = gd * DweightZ(zz + 0.5 - gz); + + int c_ = coeff_stride * out_c + in_c; + int grid_idx = x_ + sy * y_ + sz * z_ + sc * c_ + sb * b; + grid_sum += bilateral_grid[grid_idx] * wx * wy * dwz; + } + } + } + + if (in_c < input_chans) { + in_sum += + grid_sum * input[x + w * (y + h * (in_c + input_chans * b))]; + } else { + in_sum += grid_sum; + } + } + + out_sum += + in_sum * upstream_grad[x + w * (y + h * (out_c + output_chans * b))]; + } + + out_guide_grad[idx] = out_sum; + } +} + +template +__global__ void BilateralSliceCudaInputGradKernel( + T* out_input_grad, const T* upstream_grad, const T* bilateral_grid, + const T* guide, GridSizes gsz, bool has_offset, int input_count, + int output_chans) { + int h = gsz.h; + int w = gsz.w; + int gd = gsz.gd; + int gh = gsz.gh; + int gw = gsz.gw; + int input_chans = gsz.input_chans; + int grid_chans = input_chans * output_chans; + int coeff_stride = input_chans; + + if (has_offset) { + grid_chans += output_chans; + coeff_stride += 1; + } + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < input_count; + idx += blockDim.x * gridDim.x) { + int x = idx % w; + int y = (idx / w) % h; + int in_c = (idx / (h * w)) % input_chans; + int b = (idx / (input_chans * w * h)); + + T gx = (x + 0.5f) * gw / (1.0f * w); + T gy = (y + 0.5f) * gh / (1.0f * h); + T gz = guide[x + w * (y + h * b)] * gd; + + int fx = static_cast(floor(gx - 0.5f)); + int fy = static_cast(floor(gy - 0.5f)); + int fz = static_cast(floor(gz - 0.5f)); + + int sy = gw; + int sz = gh * gw; + int sc = gd * gh * gw; + int sb = grid_chans * gd * gh * gw; + + T value = 0.0f; + for (int out_c = 0; out_c < output_chans; ++out_c) { + T chan_val = 0.0f; + + for (int xx = fx; xx < fx + 2; ++xx) { + int x_ = max(min(xx, gw - 1), 0); + T wx = max(1.0f - abs(xx + 0.5 - gx), 0.0f); + + for (int yy = fy; yy < fy + 2; ++yy) { + int y_ = max(min(yy, gh - 1), 0); + T wy = max(1.0f - abs(yy + 0.5 - gy), 0.0f); + + for (int zz = fz; zz < fz + 2; ++zz) { + int z_ = 
max(min(zz, gd - 1), 0); + + T wz = WeightZ(zz + 0.5 - gz); + + int c_ = coeff_stride * out_c + in_c; + int grid_idx = x_ + sy * y_ + sz * z_ + sc * c_ + sb * b; + chan_val += bilateral_grid[grid_idx] * wx * wy * wz; + } + } + } + + value += chan_val * + upstream_grad[x + w * (y + h * (out_c + output_chans * b))]; + } + out_input_grad[idx] = value; + } +} + +template +class BilateralSliceGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* guide = ctx.Input("Guide"); + auto* grid = ctx.Input("Grid"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); + auto* guide_grad = ctx.Output(framework::GradVarName("Guide")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + const T* input_data = input->data(); + const T* guide_data = guide->data(); + const T* grid_data = grid->data(); + const T* output_grad_data = output_grad->data(); + + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + T* guide_grad_data = guide_grad->mutable_data(ctx.GetPlace()); + T* grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); + + bool has_offset = ctx.Attr("has_offset"); + + auto input_grad_dims = input_grad->dims(); + auto grid_dims = grid_grad->dims(); + + int batch_size = input_grad_dims[0]; + int h = input_grad_dims[2]; + int w = input_grad_dims[3]; + int input_chans = input_grad_dims[1]; + + int64_t coeffs_chans = grid_dims[1]; + int64_t gd = grid_dims[2]; + int64_t gh = grid_dims[3]; + int64_t gw = grid_dims[4]; + + int output_chans = 0; + if (has_offset) { + output_chans = coeffs_chans / (input_chans + 1); + } else { + output_chans = coeffs_chans / input_chans; + } + int grid_count = batch_size * gh * gw * gd * coeffs_chans; + int guide_count = batch_size * h * w; + int input_count = batch_size * h * w * input_chans; + + GridSizes grid_sizes; + grid_sizes.h = h; 
+ grid_sizes.w = w; + grid_sizes.bs = batch_size; + grid_sizes.coeffs_chans = coeffs_chans; + grid_sizes.gd = gd; + grid_sizes.gh = gh; + grid_sizes.gw = gw; + grid_sizes.input_chans = input_chans; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(grid_count, ctx, 512); + + BilateralSliceCudaGridGradKernel<<>>( + grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes, + has_offset, grid_count, output_chans); + + config = platform::getGpuLaunchConfig(guide_count, ctx, 512); + + BilateralSliceCudaGuideGradKernel<<< + config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>( + guide_grad_data, output_grad_data, grid_data, guide_data, input_data, + grid_sizes, has_offset, guide_count, output_chans); + + config = platform::getGpuLaunchConfig(input_count, ctx, 512); + + BilateralSliceCudaInputGradKernel<<< + config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>( + input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes, + has_offset, input_count, output_chans); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(bilateral_slice, ops::BilateralSliceOpCUDAKernel, + ops::BilateralSliceOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(bilateral_slice_grad, + ops::BilateralSliceGradOpCUDAKernel, + ops::BilateralSliceGradOpCUDAKernel); diff --git a/paddle/fluid/operators/bilateral_slice_op.h b/paddle/fluid/operators/bilateral_slice_op.h new file mode 100644 index 0000000000000..0903fe4c71d3d --- /dev/null +++ b/paddle/fluid/operators/bilateral_slice_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +struct GridSizes { + int64_t h; + int64_t w; + int64_t bs; + int64_t coeffs_chans; + int64_t gd; + int64_t gh; + int64_t gw; + int64_t input_chans; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index c9dcda1adb3f7..bb72174be5ed5 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -51,7 +51,7 @@ static inline framework::DDim ComputeAndCheckShape( } } else { bool check_shape = - is_runtime || (out_dims[j] > 0 && inputs_dims[i][j] > 0); + is_runtime || (inputs_dims[0][j] > 0 && inputs_dims[i][j] > 0); if (check_shape) { // check all shape in run time PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index e1cecb0a049a5..74589dcb6a74c 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -24,12 +24,12 @@ class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { OpComment comment; AddInput("X", string::Sprintf("Left hand operand of %s operator. Must be " - "a LoDTensor or Tensor of type bool.", + "a Variable of type bool.", comment.type)); AddInput("Y", string::Sprintf("Right hand operand of %s operator. 
Must be " - "a LoDTensor or Tensor of type bool.", + "a Variable of type bool.", comment.type)); - AddOutput("Out", string::Sprintf("n-dim bool LoDTensor or Tensor")); + AddOutput("Out", string::Sprintf("n-dim bool Variable")); AddComment(string::Sprintf(R"DOC(%s Operator It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor. diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 32eeae9a0145e..f6c8316e2e9fa 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,6 +183,10 @@ void FusionGRUOpMaker::Make() { "(bool, default: True) " "whether to use seq mode to compute GRU.") .SetDefault(true); + AddAttr("origin_mode", + "bool" + "use origin mode in article https://arxiv.org/abs/1412.3555") + .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. This operator fuse the fully-connected operator into GRU, diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index 3b48615338f72..a920bf7c3f505 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -42,6 +42,7 @@ class LiteEngineOp : public framework::OperatorBase { paddle::lite::Predictor *engine_; framework::proto::VarType::Type precision_; bool use_gpu_; + bool zero_copy_; public: LiteEngineOp(const std::string &type, @@ -60,6 +61,7 @@ class LiteEngineOp : public framework::OperatorBase { precision_ = framework::proto::VarType_Type_FP32; } use_gpu_ = Attr("use_gpu"); + zero_copy_ = Attr("zero_copy"); } protected: @@ -73,13 +75,13 @@ class LiteEngineOp : public framework::OperatorBase { const platform::DeviceContext *ctx = platform::DeviceContextPool::Instance().Get(dev_place); for (size_t i = 0; i < in_names_.size(); i++) { - const framework::LoDTensor &src_t = + framework::LoDTensor src_t = 
inference::analysis::GetFromScope(scope, in_names_[i]); paddle::lite::Tensor *dst_t = engine_->GetInput(i); - VLOG(3) << "[Copy] fluid -> lite (" << in_names_[i] << " -> " + VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx); + inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); } #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(dev_place)) { @@ -91,13 +93,13 @@ class LiteEngineOp : public framework::OperatorBase { engine_->Run(); VLOG(3) << "lite engine run done"; for (size_t i = 0; i < out_names_.size(); i++) { - const paddle::lite::Tensor &src_t = *(engine_->GetOutput(i)); + paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); framework::LoDTensor *dst_t = &inference::analysis::GetFromScope( scope, out_names_[i]); - VLOG(3) << "[Copy] lite -> fluid (" << out_names_[i] << " -> " + VLOG(3) << "== lite -> fluid (" << out_names_[i] << " -> " << engine_->GetOutputNames()[i] << ")"; - inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx); + inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); } #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(dev_place)) { diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 3812911e915bc..fb5c0dcb3514d 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -100,6 +100,7 @@ TEST(LiteEngineOp, engine_op) { engine_op_desc.SetAttr("engine_key", engine_key); engine_op_desc.SetAttr("enable_int8", false); engine_op_desc.SetAttr("use_gpu", true); + engine_op_desc.SetAttr("zero_copy", true); engine_op_desc.SetBlockAttr("sub_block", &block_desc); inference::Singleton::Global().Create( engine_key, config); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 02a1a4150d01a..f83b2a1a85c4f 100644 --- 
a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/platform/errors.h" namespace paddle { namespace inference { @@ -98,7 +99,7 @@ void CreateTensor(framework::Scope* scope, const std::string& name, #ifdef PADDLE_WITH_CUDA place = platform::CUDAPlace(0); #else - PADDLE_THROW(platform::errors::PreconditionNetMet( + PADDLE_THROW(platform::errors::PreconditionNotMet( "You must define PADDLE_WITH_CUDA for using CUDAPlace.")); #endif } else { diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 5c98eab403096..b3b0f8f196090 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -105,17 +105,17 @@ class LookupTableV2CUDAKernel : public framework::OpKernel { auto *table = table_t->data(); auto *output = output_t->mutable_data(context.GetPlace()); - dim3 threads(128, 8); - dim3 grids(8, 1); + dim3 threads(256, 4); + dim3 grids(80, 1); if (padding_idx == -1) LookupTableV2< - T, 128, 8, 8, + T, 256, 4, 80, false><<>>( output, table, ids, N, K, D, padding_idx); else LookupTableV2< - T, 128, 8, 8, + T, 256, 4, 80, true><<>>( output, table, ids, N, K, D, padding_idx); } diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 39bddda6caa53..64b35cfeaecd1 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -428,7 +428,8 @@ void Blas::BatchedGEMM( const int64_t strideC = M * N; #if CUDA_VERSION >= 9010 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || + std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = 
context_.tensor_core_available(); if (use_tensor_op_math) { @@ -437,11 +438,11 @@ void Blas::BatchedGEMM( VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, - strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, - strideC, batchCount, CUDA_R_32F, algo)); + handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, + fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); }); } else { #endif // CUDA_VERSION >= 9010 diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 7f507999fda0e..22e5256335c73 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -27,6 +27,9 @@ limitations under the License. 
*/ #if defined(_WIN32) #include +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif #include #endif // _WIN32 diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index ac6ddebb813fa..17e1e19583461 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -943,7 +943,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key = platform::CreateKey( src_tz, ctx.InputName("Input") + ctx.InputName("Filter")); - const std::string key_conv_pd = key + "@forward_pd"; + const std::string key_conv_pd = key + "@fwd_pd"; std::vector pipeline; // Create user memory descriptors diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 830334043c4d7..c0fbc336e46b6 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -232,10 +232,15 @@ class RunProgramOpKernel : public framework::OpKernel { auto exe_ctx = exe.Prepare(*program, 0, skip_vars); - // get scope and clear old vars - framework::Scope &scope = *(out_scope_vec->front()); - auto local_vars = scope.LocalVarNames(); - scope.EraseVars(local_vars); + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. 
+ framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + framework::Scope &scope = global_inner_scope->NewScope(); // share input_vars & parameters into scope details::ShareVarsIntoScope(input_vars, input_var_names, &scope); @@ -251,6 +256,12 @@ class RunProgramOpKernel : public framework::OpKernel { // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); } }; @@ -285,8 +296,8 @@ class RunProgramGradOpKernel : public framework::OpKernel { auto orig_end_op_index = ctx.Attr("end_op_index"); // NOTE: skip `shape` and `fill_constant` op created by - // fluid.backward.gradients, - // one forward output will generate one `shape` and `fill_constant` + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` int64_t start_op_index = orig_end_op_index + (output_grad_vars.size() * 2); int64_t end_op_index = block->OpSize(); @@ -295,7 +306,16 @@ class RunProgramGradOpKernel : public framework::OpKernel { out_scope_vec->size(), 1, platform::errors::InvalidArgument( "The OutScope of RunProgramGradOp should only hold one scope.")); - auto &scope = *(out_scope_vec->front()); + + framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); // Step 2. 
prepare executor and scope framework::Executor exe(ctx.GetPlace()); @@ -324,6 +344,11 @@ class RunProgramGradOpKernel : public framework::OpKernel { // Step 4. get outputs details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, &scope); details::ShareVarsFromScope(param_grad_vars, param_grad_names, &scope); + + // Step5. drop current scope + global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 050ab2c9418f6..8a3bb5318cb3b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -180,7 +180,10 @@ REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, ops::SequencePoolGradOpNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL( sequence_pool, - ops::SequencePoolKernel); + ops::SequencePoolKernel, + ops::SequencePoolKernel); + REGISTER_OP_CPU_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index 8b5d859a8d315..63420ee30e446 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -24,24 +24,30 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "Input(X) of SequencePoolOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROW"), true, - "Input(ROW) of SequencePoolOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("COLUMN"), true, - 
"Input(COLUMN) of SequencePoolOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Output(Out) of SequencePoolOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("pos"), true, - "pos(out) should not be null"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceTopkAvgPooling"); + OP_INOUT_CHECK(ctx->HasInput("ROW"), "Input", "ROW", + "SequenceTopkAvgPooling"); + OP_INOUT_CHECK(ctx->HasInput("COLUMN"), "Input", "COLUMN", + "SequenceTopkAvgPooling"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "SequenceTopkAvgPooling"); + OP_INOUT_CHECK(ctx->HasOutput("pos"), "Output", "pos", + "SequenceTopkAvgPooling"); auto attr = ctx->Attrs(); auto channel_num = attr.Get("channel_num"); + PADDLE_ENFORCE_GT( + channel_num, 0, + platform::errors::InvalidArgument( + "Expected channel_num > 0, but received %d.", channel_num)); + auto topks = attr.Get>("topks"); + auto num_k = topks.size(); + PADDLE_ENFORCE_GT( + num_k, 0, platform::errors::InvalidArgument( + "Expected topks.size() > 0, but received %zu.", num_k)); auto row_dim = ctx->GetInputDim("ROW"); - - auto num_k = topks.size(); auto row_shape_0 = row_dim[0]; std::vector vec_out_shape; @@ -49,7 +55,7 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { vec_out_shape.push_back(channel_num * num_k); ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape)); - ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("ROW", "Out"); } }; @@ -78,10 +84,10 @@ class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - "Gradient of Out should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "The input X should not be null."); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), 
"SequenceTopkAvgPoolingGrad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "SequenceTopkAvgPoolingGrad"); ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index 2cb70ee736d38..e8e0241e46ad2 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -13,52 +13,57 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include +#include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +static constexpr int TopKPosPaddingId = -1; + +namespace details { + template -void get_topk_pos(const T* data, int length, int k, int* pos) { - size_t real_k = k < length ? 
k : length; - - std::vector v(data, data + length); - - std::vector topk_pos; - T min_val = std::numeric_limits::lowest(); - while (topk_pos.size() < real_k) { - T max_val = min_val; - int max_pos = -1; - for (int i = 0; i < length; ++i) { - if (v[i] > max_val) { - max_pos = i; - max_val = v[i]; +static void get_topk_pos(const T* data, int length, int k, int* pos) { + VLOG(3) << "length: " << length << " , k : " << k; + + std::priority_queue, std::vector>, + std::greater>> + topk_queue; + + for (int i = 0; i < length; ++i) { + T elem = data[i]; + if (topk_queue.size() < static_cast(k)) { + topk_queue.emplace(elem, i); + } else { + if (elem >= topk_queue.top().first) { + // replace top node if found a bigger value + topk_queue.pop(); + topk_queue.emplace(elem, i); } } - - assert(max_pos >= 0); - - topk_pos.push_back(max_pos); - v[max_pos] = min_val; } - - assert(topk_pos.size() > 0); - while (topk_pos.size() < (size_t)k) { - topk_pos.push_back(-1); + // reversely assign value + int real_k = topk_queue.size(); + for (int i = real_k - 1; i >= 0; --i) { + pos[i] = topk_queue.top().second; + topk_queue.pop(); } - - for (size_t i = 0; i < topk_pos.size(); ++i) { - pos[i] = topk_pos[i]; + // if length of data is less than k, fill TopKPosPaddingId at the end of pos. 
+ for (int i = real_k; i < k; ++i) { + pos[i] = TopKPosPaddingId; } } - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; +} // namespace details template class SequenceTopkAvgPoolingKernel : public framework::OpKernel { @@ -70,20 +75,29 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto* pos = context.Output("pos"); - PADDLE_ENFORCE_EQ(in->lod().empty(), false, - "Input(X) Tensor of SequenceTopkAvgPoolingOp does not " - "contain LoD information."); - PADDLE_ENFORCE_EQ(row->lod().empty(), false, - "Input(ROW) Tensor of SequenceTopkAvgPoolingOp does not " - "contain LoD information."); - PADDLE_ENFORCE_EQ(col->lod().empty(), false, - "Input(COLUMN) Tensor of SequenceTopkAvgPoolingOp does " - "not contain LoD information."); + PADDLE_ENFORCE_EQ( + in->lod().empty(), false, + platform::errors::InvalidArgument( + "Input(X) Tensor of SequenceTopkAvgPoolingOp does not " + "contain LoD information.")); + PADDLE_ENFORCE_EQ( + row->lod().empty(), false, + platform::errors::InvalidArgument( + "Input(ROW) Tensor of SequenceTopkAvgPoolingOp does not " + "contain LoD information.")); + PADDLE_ENFORCE_EQ( + col->lod().empty(), false, + platform::errors::InvalidArgument( + "Input(COLUMN) Tensor of SequenceTopkAvgPoolingOp does " + "not contain LoD information.")); auto channel_num = context.Attr("channel_num"); auto topks = context.Attr>("topks"); auto k_num = topks.size(); auto max_k = topks[topks.size() - 1]; + PADDLE_ENFORCE_GE(max_k, 0, + platform::errors::InvalidArgument( + "Expected max_k >= 0, but received %d.", max_k)); std::vector vec_pos_shape; auto in_lod = in->lod()[0]; @@ -116,7 +130,10 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel { int row_size = row_lod[i + 1] - row_lod[i]; int col_size = col_lod[i + 1] - col_lod[i]; PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size, - "size wrong in 
sequence_topk_avg_pooling_op!"); + platform::errors::PreconditionNotMet( + "Expected total_size == channel_num * row_size * " + "col_size, but got %d != %d.", + total_size, channel_num * row_size * col_size)); int feature_num = row_size * col_size; for (int j = 0; j < channel_num; ++j) { @@ -130,14 +147,14 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel { auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num + r * channel_num * k_num + j * k_num; - get_topk_pos(row_data, col_size, max_k, pos_slice_data); - if (pos_slice_data[0] == -1) { + details::get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == TopKPosPaddingId) { sum_data[0] = 0.0; } else { sum_data[0] = row_data[pos_slice_data[0]]; } for (int k = 1; k < max_k; ++k) { - if (pos_slice_data[k] == -1) { + if (pos_slice_data[k] == TopKPosPaddingId) { sum_data[k] = sum_data[k - 1]; } else { sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; @@ -206,7 +223,7 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { for (size_t m = 0; m < k_num; ++m) { for (int k = 0; k < topks[m]; ++k) { - if (pos_slice_data[k] == -1) { + if (pos_slice_data[k] == TopKPosPaddingId) { break; } else { in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m]; diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu index 2df4ad1339973..4b9dca0d4028b 100644 --- a/paddle/fluid/operators/shape_op.cu +++ b/paddle/fluid/operators/shape_op.cu @@ -14,8 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/shape_op.h" -REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel); +REGISTER_OP_CUDA_KERNEL( + shape, paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index d6945df9e184e..7493b18936492 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -12,145 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void Padding(const paddle::platform::float16* d_out, - const int64_t* out_dims, const int64_t* in_dims, - const int64_t* offsets, int64_t n, - paddle::platform::float16* d_in) { - int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x; - if (out_idx < n) { - int64_t out_idx_tmp = out_idx; - int64_t coords[D] = {0}; - for (int i = D - 1; i >= 0; --i) { - coords[i] = out_idx_tmp % out_dims[i]; - out_idx_tmp /= out_dims[i]; - coords[i] += offsets[i]; - } - - int64_t in_idx = 0; - for (int i = 0; i < D; ++i) { - in_idx = in_idx * in_dims[i] + coords[i]; - } - - d_in[in_idx] = d_out[out_idx]; - } -} - -template <> -class SliceGradKernel - : public framework::OpKernel { - public: 
- void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_in = ctx.Output(framework::GradVarName("Input")); - d_in->mutable_data(ctx.GetPlace()); - - auto out_dims = d_out->dims(); - auto in_dims = d_in->dims(); - int rank = out_dims.size(); - std::vector offsets(rank, 0); - auto axes = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - std::vector starts(starts_int.begin(), starts_int.end()); - - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); - } - - for (size_t i = 0; i < starts.size(); ++i) { - if (starts[i] < 0) { - starts[i] += in_dims[axes[i]]; - } - offsets[axes[i]] = std::max(starts[i], static_cast(0)); - } - - math::SetConstant - set_zero; - auto& dev_ctx = - ctx.template device_context(); - set_zero(dev_ctx, d_in, static_cast(0)); - - int64_t numel = d_out->numel(); - dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1); - dim3 threads(PADDLE_CUDA_NUM_THREADS); - auto stream = ctx.cuda_device_context().stream(); - const std::vector out_shape = - framework::vectorize(out_dims); - const std::vector in_shape = - framework::vectorize(in_dims); - - framework::Tensor out_dims_tensor; - framework::Tensor in_dims_tensor; - framework::Tensor offsets_tensor; - framework::TensorFromVector(out_shape, ctx.device_context(), - &out_dims_tensor); - framework::TensorFromVector(in_shape, ctx.device_context(), - &in_dims_tensor); - framework::TensorFromVector(offsets, ctx.device_context(), &offsets_tensor); - const int64_t* out_dims_ptr = out_dims_tensor.data(); - const int64_t* in_dims_ptr = in_dims_tensor.data(); - const int64_t* offsets_ptr = offsets_tensor.data(); - - switch (rank) { - case 1: - Padding<1><<>>( - 
d_out->data(), out_dims_ptr, in_dims_ptr, - offsets_ptr, numel, d_in->data()); - break; - case 2: - Padding<2><<>>( - d_out->data(), out_dims_ptr, in_dims_ptr, - offsets_ptr, numel, d_in->data()); - break; - case 3: - Padding<3><<>>( - d_out->data(), out_dims_ptr, in_dims_ptr, - offsets_ptr, numel, d_in->data()); - break; - case 4: - Padding<4><<>>( - d_out->data(), out_dims_ptr, in_dims_ptr, - offsets_ptr, numel, d_in->data()); - break; - case 5: - Padding<5><<>>( - d_out->data(), out_dims_ptr, in_dims_ptr, - offsets_ptr, numel, d_in->data()); - break; - case 6: - Padding<6><<>>( - d_out->data(), out_dims_ptr, in_dims_ptr, - offsets_ptr, numel, d_in->data()); - break; - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( slice, ops::SliceKernel, ops::SliceKernel, diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 39cc605f6b318..ee46f4d821c78 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -350,7 +350,7 @@ class SliceGradKernel : public framework::OpKernel { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(context.GetPlace()); - T value = 0.0; + T value = T(0); math::SetConstant functor; for (int i = 0; i < d_in_size; ++i) { auto dim = input_array->at(i).dims(); @@ -440,7 +440,7 @@ class SliceGradKernel : public framework::OpKernel { auto d_out_t = framework::EigenTensor::From( *d_out, out_dims); - d_in_t.device(place) = d_out_t.pad(paddings, 0); + d_in_t.device(place) = d_out_t.pad(paddings, T(0)); } }; } // namespace operators diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 344dfe23996fd..ba56e5e36f985 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -27,9 
+27,11 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, CUDA_KERNEL_LOOP(index, n * remain) { int idx_n = index / remain; int idx_remain = index % remain; - int idx = idx_n * d + labels[index] * remain + idx_remain; - logit_grad[idx] -= - ignore_index == labels[index] ? static_cast(0.) : static_cast(1.); + int tmp = labels[index]; + if (ignore_index != tmp) { + int idx = idx_n * d + tmp * remain + idx_remain; + logit_grad[idx] -= static_cast(1.); + } } } diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index b658e78629cc2..859776bc2a0f0 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -13,15 +13,73 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/squeeze_op.h" + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +framework::DDim GetOutputShape(const std::vector squeeze_dims, + const framework::DDim &in_dims, + bool is_runtime) { + size_t num_squeeze_dims = squeeze_dims.size(); + std::vector should_squeeze(in_dims.size(), false); + + // Mark dimensions need to be squeezed. + if (num_squeeze_dims == 0) { + for (int i = 0; i < in_dims.size(); ++i) { + if (in_dims[i] == 1) { + should_squeeze[i] = true; + } + } + } else { + for (size_t i = 0; i < num_squeeze_dims; ++i) { + int current = squeeze_dims[i] < 0 ? 
squeeze_dims[i] + in_dims.size() + : squeeze_dims[i]; + + PADDLE_ENFORCE_GE( + current, 0, + platform::errors::InvalidArgument( + "Each axis in Attr(axes) should be in the range of [%d, %d]" + "But current axis is:%d, input tensor's shape = [%s].", + -in_dims.size(), in_dims.size() - 1, current, in_dims)); + PADDLE_ENFORCE_LT( + current, in_dims.size(), + platform::errors::InvalidArgument( + "Each axis in Attr(axes) should be in the range of [%d, %d]" + "But current axis is:%d, input tensor's shape = [%s].", + -in_dims.size(), in_dims.size() - 1, current, in_dims)); + + if (!should_squeeze[current]) { + if (is_runtime) { + // At run time, dim of 1 is allowed to squeeze + if (in_dims[current] == 1) { + should_squeeze[current] = true; + } + } else { + // At compile time, dim of -1 or 1 is allowed to squeeze + if (in_dims[current] == 1 || in_dims[current] == -1) { + should_squeeze[current] = true; + } + } + } + } + } + // Make output dimensions + std::vector output_shape; + for (int i = 0; i < in_dims.size(); ++i) { + if (!should_squeeze[i]) { + output_shape.push_back(in_dims[i]); + } + } + return framework::make_ddim(output_shape); +} + class SqueezeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -40,7 +98,7 @@ class SqueezeOp : public framework::OperatorWithKernel { x_dims.size(), x_dims)); const auto &axes = ctx->Attrs().Get>("axes"); - auto out_dims = GetOutputShape(axes, x_dims); + auto out_dims = GetOutputShape(axes, x_dims, false); ctx->SetOutputDim("Out", out_dims); if (x_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) @@ -49,56 +107,6 @@ class SqueezeOp : public framework::OperatorWithKernel { } } - static framework::DDim GetOutputShape(const std::vector squeeze_dims, - const framework::DDim &in_dims) { - size_t num_squeeze_dims = squeeze_dims.size(); - int cnt_squeezed_dims = 0; - bool should_squeeze[9] = {false}; - - // Determines number of dimensions 
of output tensor after squeeze. - // Mark and count the dimensions need to be squeezed - if (num_squeeze_dims == 0) { - for (int idx = 0; idx < in_dims.size(); ++idx) { - if (in_dims[idx] == 1) { - should_squeeze[idx] = true; - ++cnt_squeezed_dims; - } - } - } else { - for (size_t idx = 0; idx < num_squeeze_dims; ++idx) { - int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size() - : squeeze_dims[idx]; - PADDLE_ENFORCE_GE( - current, 0, - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), in_dims.size() - 1, current, in_dims)); - PADDLE_ENFORCE_LT( - current, in_dims.size(), - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), in_dims.size() - 1, current, in_dims)); - - if (!(should_squeeze[current])) { - ++cnt_squeezed_dims; - } - should_squeeze[current] = true; - } - } - - // Make output dimensions - std::vector output_shape(in_dims.size() - cnt_squeezed_dims, 0); - for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) { - if (!should_squeeze[in_idx]) { - output_shape[out_idx++] = in_dims[in_idx]; - } - } - - return framework::make_ddim(output_shape); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -183,7 +191,7 @@ class Squeeze2Op : public framework::OperatorWithKernel { const auto &axes = ctx->Attrs().Get>("axes"); - auto out_dims = SqueezeOp::GetOutputShape(axes, x_dims); + auto out_dims = GetOutputShape(axes, x_dims, false); ctx->SetOutputDim("Out", out_dims); if (x_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index e8e53bb0f4fcd..2f621c11e58f6 100644 --- 
a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -24,6 +25,9 @@ limitations under the License. */ namespace paddle { namespace operators { +framework::DDim GetOutputShape(const std::vector squeeze_dims, + const framework::DDim &in_dims, bool is_runtime); + template class SqueezeKernel : public framework::OpKernel { public: @@ -33,7 +37,7 @@ class SqueezeKernel : public framework::OpKernel { auto &axes = context.Attr>("axes"); auto x_dims = in->dims(); - auto out_dims = GetOutputShape(axes, x_dims); + auto out_dims = GetOutputShape(axes, x_dims, true); out->mutable_data(context.GetPlace(), in->type()); framework::TensorCopy( @@ -41,64 +45,6 @@ class SqueezeKernel : public framework::OpKernel { context.template device_context(), out); out->Resize(out_dims); } - - static framework::DDim GetOutputShape(const std::vector squeeze_dims, - const framework::DDim &in_dims) { - size_t num_squeeze_dims = squeeze_dims.size(); - int cnt_squeezed_dims = 0; - bool should_squeeze[9] = {false}; - - // Determines number of dimensions of output tensor after squeeze. - // Mark and count the dimensions need to be squeezed - if (num_squeeze_dims == 0) { - for (int idx = 0; idx < in_dims.size(); ++idx) { - if (in_dims[idx] == 1) { - should_squeeze[idx] = true; - ++cnt_squeezed_dims; - } - } - } else { - for (size_t idx = 0; idx < num_squeeze_dims; ++idx) { - int current = squeeze_dims[idx] < 0 ? 
squeeze_dims[idx] + in_dims.size() - : squeeze_dims[idx]; - - PADDLE_ENFORCE_GE( - current, 0, - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), in_dims.size() - 1, current, in_dims)); - PADDLE_ENFORCE_LT( - current, in_dims.size(), - platform::errors::InvalidArgument( - "Each axis in Attr(axes) should be in the range of [%d, %d]" - "But current axis is:%d, input tensor's shape = [%s].", - -in_dims.size(), in_dims.size() - 1, current, in_dims)); - - PADDLE_ENFORCE_EQ(in_dims[current], 1, - platform::errors::InvalidArgument( - "The size of axis that will be squeezed " - "should be equal to 1. But current axis = %d," - "input tensor's shape = [%s].", - in_dims[current], in_dims)); - - if (!(should_squeeze[current])) { - ++cnt_squeezed_dims; - } - should_squeeze[current] = true; - } - } - - // Make output dimensions - std::vector output_shape(in_dims.size() - cnt_squeezed_dims, 0); - for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) { - if (!should_squeeze[in_idx]) { - output_shape[out_idx++] = in_dims[in_idx]; - } - } - - return framework::make_ddim(output_shape); - } }; template @@ -126,8 +72,7 @@ class Squeeze2Kernel : public framework::OpKernel { auto &axes = context.Attr>("axes"); auto x_dims = in->dims(); - auto out_dims = - SqueezeKernel::GetOutputShape(axes, x_dims); + auto out_dims = GetOutputShape(axes, x_dims, true); out->mutable_data(context.GetPlace(), in->type()); framework::TensorCopy( diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu index e9e55c20fc5be..79dd29ebc691c 100644 --- a/paddle/fluid/operators/transpose_op.cu +++ b/paddle/fluid/operators/transpose_op.cu @@ -660,19 +660,26 @@ template class TransposeGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - 
auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - if (out->numel() == 0) { + auto* x = context.InputVar("X"); + auto* out = context.OutputVar("Out"); + + const framework::Tensor* x_tensor = + GetLoDTensorOrSelectedRowsValueFromVar(*x); + framework::Tensor* out_tensor = + GetMutableLoDTensorOrSelectedRowsValueFromVar(out); + + out_tensor->mutable_data(context.GetPlace()); + if (out_tensor->numel() == 0) { return; } std::vector axis = context.Attr>("axis"); int ndims = axis.size(); const auto& dev_ctx = context.template device_context(); - auto ret = TransposeSimple::run(dev_ctx, *x, axis, out); + auto ret = TransposeSimple::run(dev_ctx, *x_tensor, axis, out_tensor); if (!ret) { - TransCompute(ndims, dev_ctx, *x, out, axis); + TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, + axis); } } }; @@ -680,14 +687,19 @@ template class TransposeGradGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* x_grad = - context.Output(framework::GradVarName("X")); - if (!x_grad) return; - - x_grad->mutable_data(context.GetPlace()); - if (x_grad->numel() == 0) { + auto* out_grad = context.InputVar(framework::GradVarName("Out")); + auto* x_grad = context.OutputVar(framework::GradVarName("X")); + if (!x_grad) { + return; + } + + const framework::Tensor* out_grad_tensor = + GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); + framework::Tensor* x_grad_tensor = + GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); + + x_grad_tensor->mutable_data(context.GetPlace()); + if (x_grad_tensor->numel() == 0) { return; } std::vector axis = context.Attr>("axis"); @@ -699,11 +711,11 @@ class TransposeGradGPUKernel : public framework::OpKernel { int ndims = axis.size(); const auto& dev_ctx = context.template device_context(); - auto ret = - TransposeSimple::run(dev_ctx, *out_grad, reversed_axis, x_grad); + auto ret = 
TransposeSimple::run(dev_ctx, *out_grad_tensor, reversed_axis, + x_grad_tensor); if (!ret) { - TransCompute(ndims, dev_ctx, *out_grad, x_grad, - reversed_axis); + TransCompute(ndims, dev_ctx, *out_grad_tensor, + x_grad_tensor, reversed_axis); } } }; diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index f2951e90ebe88..d7f5c3dd457c9 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -64,16 +64,23 @@ template class TransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - if (out->numel() == 0) { + auto* x = context.InputVar("X"); + auto* out = context.OutputVar("Out"); + + const framework::Tensor* x_tensor = + GetLoDTensorOrSelectedRowsValueFromVar(*x); + framework::Tensor* out_tensor = + GetMutableLoDTensorOrSelectedRowsValueFromVar(out); + + out_tensor->mutable_data(context.GetPlace()); + if (out_tensor->numel() == 0) { return; } + std::vector axis = context.Attr>("axis"); int ndims = axis.size(); auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x, out, axis); + TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); } }; @@ -81,14 +88,19 @@ template class TransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* x_grad = - context.Output(framework::GradVarName("X")); - if (!x_grad) return; - - x_grad->mutable_data(context.GetPlace()); - if (x_grad->numel() == 0) { + auto* out_grad = context.InputVar(framework::GradVarName("Out")); + auto* x_grad = context.OutputVar(framework::GradVarName("X")); + + if (!x_grad) { + return; + } + const framework::Tensor* out_grad_tensor = + 
GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); + framework::Tensor* x_grad_tensor = + GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); + + x_grad_tensor->mutable_data(context.GetPlace()); + if (x_grad_tensor->numel() == 0) { return; } @@ -101,8 +113,8 @@ class TransposeGradKernel : public framework::OpKernel { int ndims = axis.size(); auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad, x_grad, - reversed_axis); + TransCompute(ndims, dev_ctx, *out_grad_tensor, + x_grad_tensor, reversed_axis); } }; diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index f632550c65182..cc19fd5ac4985 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -81,7 +81,7 @@ class NCCLCommContext { PADDLE_ENFORCE_GT( comm_map_.count(ring_id), 0, platform::errors::InvalidArgument( - "Comunicator in ring id %d has not been initialized.", ring_id)); + "Communicator in ring id %d has not been initialized.", ring_id)); PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1, platform::errors::InvalidArgument( "One device id should be specified to retrieve from " @@ -94,11 +94,11 @@ class NCCLCommContext { PADDLE_ENFORCE_GT( comm_map_.count(ring_id), 0, platform::errors::InvalidArgument( - "Comunicator of ring id %d has not been initialized.", ring_id)); + "Communicator of ring id %d has not been initialized.", ring_id)); PADDLE_ENFORCE_GT( comm_map_.at(ring_id).count(dev_id), 0, platform::errors::InvalidArgument( - "Comunicator at device id %d has not been initialized in ring %d.", + "Communicator at device id %d has not been initialized in ring %d.", dev_id, ring_id)); return comm_map_.at(ring_id).at(dev_id).get(); } diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index dbc4e813d6f0f..a402f397348a4 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -44,8 +44,8 @@ void 
SetNumThreads(int num_threads) { omp_set_num_threads(real_num_threads); #else PADDLE_THROW(platform::errors::Unimplemented( - "The library (except OPENBLAS, MKLML) is to be implemented, thus " - "number of threads can not be set.")); + "This library (except OPENBLAS, MKLML) is not supported yet, so the" + "number of threads cannot be set.")); #endif } diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 63760ada2b4d5..b86fd70c9aecd 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -23,7 +23,9 @@ limitations under the License. */ #include #include #elif defined(_WIN32) +#ifndef NOMINMAX #define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif #include #else #include diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc index 65c8b96028ace..6ecb312d72072 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/cuda_resource_pool.cc @@ -50,11 +50,11 @@ std::shared_ptr CudaStreamResourcePool::New(int dev_idx) { PADDLE_ENFORCE_GE( dev_idx, 0, platform::errors::InvalidArgument( - "dev_idx should be not less than 0, but got %d", dev_idx)); + "The dev_idx should be not less than 0, but got %d.", dev_idx)); PADDLE_ENFORCE_LT( dev_idx, pool_.size(), platform::errors::OutOfRange( - "dev_idx should be less than device count %d, but got %d", + "The dev_idx should be less than device count %d, but got %d.", pool_.size(), dev_idx)); return pool_[dev_idx]->New(); } @@ -89,11 +89,11 @@ std::shared_ptr CudaEventResourcePool::New(int dev_idx) { PADDLE_ENFORCE_GE( dev_idx, 0, platform::errors::InvalidArgument( - "dev_idx should be not less than 0, but got %d", dev_idx)); + "The dev_idx should be not less than 0, but got %d.", dev_idx)); PADDLE_ENFORCE_LT( dev_idx, pool_.size(), platform::errors::OutOfRange( - "dev_idx should be less than device count %d, but got %d", + "The dev_idx should be less than device count %d, 
but got %d.", pool_.size(), dev_idx)); return pool_[dev_idx]->New(); } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index b1da9862aa81a..efb57e12fdbe6 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -142,8 +142,8 @@ inline ActivationMode StringToActivationMode(const std::string& str) { } else if (str == "bandpass") { return ActivationMode::kBandPass; } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unknown activation string: %s.", str)); + PADDLE_THROW(platform::errors::Unimplemented( + "Unknown CUDNN activation string: %s.", str)); } } diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index e8b2d5d4ed12d..9d5a0954b00b1 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -60,10 +60,10 @@ platform::DeviceCode* DeviceCodePool::Get(const platform::Place& place, } DeviceCodePool::DeviceCodePool(const std::vector& places) { - PADDLE_ENFORCE_GT( - places.size(), 0, - errors::InvalidArgument( - "Expected the number of places >= 1. Expected %d.", places.size())); + PADDLE_ENFORCE_GT(places.size(), 0, + errors::InvalidArgument( + "Expected the number of places >= 1. But received %d.", + places.size())); // Remove the duplicated places std::set set; for (auto& p : places) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 72733f5153b34..38b0894c3f71d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -103,9 +103,9 @@ DeviceContextPool::DeviceContextPool( #ifdef PADDLE_WITH_CUDA EmplaceDeviceContext(&device_contexts_, p); #else - PADDLE_THROW(platform::errors::Unimplemented( - "'CUDAPlace is not supported. Please re-compile with WITH_GPU." - "option")); + PADDLE_THROW( + platform::errors::Unimplemented("CUDAPlace is not supported. 
Please " + "re-compile with WITH_GPU option.")); #endif } else if (platform::is_cuda_pinned_place(p)) { #ifdef PADDLE_WITH_CUDA @@ -113,8 +113,8 @@ DeviceContextPool::DeviceContextPool( &device_contexts_, p); #else PADDLE_THROW(platform::errors::Unimplemented( - "'CUDAPlace' is not supported. Please re-compile with WITH_GPU." - "option")); + "CUDAPlace is not supported. Please re-compile with WITH_GPU " + "option.")); #endif } } diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index b944fead0935b..82e4f6ac75ec1 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -172,13 +172,19 @@ static inline void* GetDsoHandleFromSearchPath( // 5. [If Failed] logging or throw error info if (nullptr == dso_handle) { auto error_msg = - "Failed to find dynamic library: %s ( %s ) \n" - "Please specify its path correctly using following ways: \n" - " set environment variable LD_LIBRARY_PATH on Linux or " - "DYLD_LIBRARY_PATH on Mac OS. \n" - " For instance, issue command: export LD_LIBRARY_PATH=... \n" - " Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " - "impossible unless System Integrity Protection (SIP) is disabled."; + "The third-party dynamic library (%s) that Paddle depends on is not " + "configured correctly. (error code is %s)\n" + " Suggestions:\n" + " 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) " + "is installed correctly and its version is matched with paddlepaddle " + "you installed.\n" + " 2. 
Configure third-party dynamic library environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " + "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " + "impossible unless System Integrity Protection (SIP) is disabled.]"; #if !defined(_WIN32) auto errorno = dlerror(); #else @@ -186,7 +192,8 @@ static inline void* GetDsoHandleFromSearchPath( #endif // !_WIN32 if (throw_on_error) { // NOTE: Special error report case, no need to change its format - PADDLE_THROW(platform::errors::NotFound(error_msg, dso_name, errorno)); + PADDLE_THROW( + platform::errors::PreconditionNotMet(error_msg, dso_name, errorno)); } else { LOG(WARNING) << string::Sprintf(error_msg, dso_name, errorno); } diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 35fa9e88b481a..566f887014b94 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -30,21 +30,25 @@ namespace dynload { extern std::once_flag tensorrt_dso_flag; extern void* tensorrt_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using tensorrt_func = decltype(&::__name); \ - std::call_once(tensorrt_dso_flag, []() { \ - tensorrt_dso_handle = \ - paddle::platform::dynload::GetTensorRtDsoHandle(); \ - PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \ - }); \ - static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ - PADDLE_ENFORCE(p_##__name, "load %s failed", #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using tensorrt_func = decltype(&::__name); \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = \ + paddle::platform::dynload::GetTensorRtDsoHandle(); \ + PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle, \ + platform::errors::Unavailable( \ + "Load tensorrt %s failed", #__name)); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL( \ + p_##__name, \ + platform::errors::Unavailable("Load tensorrt %s failed", #__name)); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define TENSORRT_RAND_ROUTINE_EACH(__macro) \ diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 5d755d8c830c1..475256826f360 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -19,9 +19,11 @@ limitations under the License. */ #endif // __GNUC__ #if !defined(_WIN32) -#include // dladdr -#else // _WIN32 -#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include // dladdr +#else // _WIN32 +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif #include // GetModuleFileName #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 2fddc23b43a61..5f63233d8bee4 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -344,10 +344,10 @@ class RecordedCudaMallocHelper { PADDLE_ENFORCE_GE( dev_id, 0, platform::errors::OutOfRange( - "Device id must be not less than 0, but got %d", dev_id)); + "Device id must be not less than 0, but got %d.", dev_id)); PADDLE_ENFORCE_LT( dev_id, instances_.size(), - platform::errors::OutOfRange("Device id %d exceeds gpu card number %d", + platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", dev_id, instances_.size())); return instances_[dev_id].get(); } diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h 
index d57478b89781e..fd6e80527caf6 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -31,9 +31,10 @@ struct GpuLaunchConfig { }; inline GpuLaunchConfig getGpuLaunchConfig( - const int N, const framework::ExecutionContext& ctx) { + const int N, const framework::ExecutionContext& ctx, + int max_threads = 1024) { int threads = - std::min(1024, ctx.cuda_device_context().GetMaxThreadsPerBlock()); + std::min(max_threads, ctx.cuda_device_context().GetMaxThreadsPerBlock()); int physical_thread_count = std::min(ctx.cuda_device_context().GetMaxPhysicalThreadCount(), N); int blocks = std::min((physical_thread_count + threads - 1) / threads, diff --git a/paddle/fluid/platform/gpu_launch_param_config.h b/paddle/fluid/platform/gpu_launch_param_config.h index c1ea06336002f..40f4ef975e76c 100755 --- a/paddle/fluid/platform/gpu_launch_param_config.h +++ b/paddle/fluid/platform/gpu_launch_param_config.h @@ -39,7 +39,7 @@ inline GpuLaunchParamConfig GetGpuLaunchConfig1D( const platform::CUDADeviceContext& context, int element_count) { PADDLE_ENFORCE_GT(element_count, 0, platform::errors::InvalidArgument( "element count should greater than 0," - " but received value is:%d", + " but received value is %d.", element_count)); const int theory_thread_count = element_count; diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 9753a39c40c37..d9c8026bd285e 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -117,14 +117,18 @@ void InitCupti() { #ifdef PADDLE_WITH_CUPTI if (FLAGS_multiple_of_cupti_buffer_size == 1) return; size_t attrValue = 0, attrValueSize = sizeof(size_t); -#define MULTIPLY_ATTR_VALUE(attr) \ - { \ - PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \ - attr, &attrValueSize, &attrValue)); \ - attrValue *= FLAGS_multiple_of_cupti_buffer_size; \ - LOG(WARNING) << "Set " #attr " " << attrValue << " byte"; \ - 
PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \ - attr, &attrValueSize, &attrValue)); \ +#define MULTIPLY_ATTR_VALUE(attr) \ + { \ + PADDLE_ENFORCE_EQ( \ + !platform::dynload::cuptiActivityGetAttribute(attr, &attrValueSize, \ + &attrValue), \ + true, platform::errors::Unavailable("Get cupti attribute failed.")); \ + attrValue *= FLAGS_multiple_of_cupti_buffer_size; \ + LOG(WARNING) << "Set " #attr " " << attrValue << " byte"; \ + PADDLE_ENFORCE_EQ( \ + !platform::dynload::cuptiActivitySetAttribute(attr, &attrValueSize, \ + &attrValue), \ + true, platform::errors::Unavailable("Set cupti attribute failed.")); \ } MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE); MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 9204cde29182a..5d7143f56b3f3 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -54,7 +54,7 @@ class MKLDNNHandlerT { } std::shared_ptr AcquireForwardPrimitive() { - const std::string key_p = key_ + "@forward_p"; + const std::string key_p = key_ + "@fwd_p"; auto forward_p = std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (forward_p == nullptr) { @@ -65,7 +65,7 @@ class MKLDNNHandlerT { } std::shared_ptr AcquireBackwardPrimitive() { - const std::string key_p = key_ + "@backward_p"; + const std::string key_p = key_ + "@bwd_p"; auto backward_p = std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { @@ -112,11 +112,11 @@ class MKLDNNHandlerT { protected: bool isCached() { - const std::string key_pd = key_common_ + "@forward_pd"; + const std::string key_pd = key_common_ + "@fwd_pd"; fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); - const std::string key_p = key_ + "@forward_p"; + const std::string key_p = key_ + "@fwd_p"; return (dev_ctx_.GetBlob(key_p) != nullptr); } @@ -129,7 +129,7 @@ class MKLDNNHandlerT { // Forward PD has to be passed 
to Grad op that // may be executed by diffrent thread, hence // for that one we use key that does not contain TID - const std::string key_pd = key_common_ + "@forward_pd"; + const std::string key_pd = key_common_ + "@fwd_pd"; fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); if (fwd_pd_ == nullptr) { @@ -169,13 +169,13 @@ class MKLDNNHandlerT { template void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - const std::string key_fwd_pd = key_common_ + "@forward_pd"; + const std::string key_fwd_pd = key_common_ + "@fwd_pd"; fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_fwd_pd)); PADDLE_ENFORCE_NOT_NULL( fwd_pd_, platform::errors::Unavailable( "Get MKLDNN Forward primitive %s failed.", key_fwd_pd)); - const std::string key_pd = key_ + "@backward_pd"; + const std::string key_pd = key_ + "@bwd_pd"; bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); if (bwd_pd_ == nullptr) { @@ -500,17 +500,17 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { if (!this->isCached()) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor")); + platform::errors::InvalidArgument("Wrong layout set for X tensor.")); PADDLE_ENFORCE_NE( x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor")); + platform::errors::InvalidArgument("Wrong format set for X tensor.")); PADDLE_ENFORCE_EQ( y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor")); + platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); PADDLE_ENFORCE_NE( y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor")); + platform::errors::InvalidArgument("Wrong format set for Y tensor.")); const auto src_x_tz = framework::vectorize(x->dims()); const auto src_y_tz = framework::vectorize(y->dims()); @@ -774,10 +774,10 @@ class PoolingMKLDNNHandler : public 
MKLDNNHandlerTisCached()) { PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument( - "Wrong layout set for Input tensor")); + "Wrong layout set for Input tensor.")); PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( - "Wrong format set for Input tensor")); + "Wrong format set for Input tensor.")); const std::string pooling_type = ctx.Attr("pooling_type"); @@ -795,15 +795,21 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT("padding_algorithm"); // Only 2D pooling is supported now - PADDLE_ENFORCE_EQ(ksize.size(), 2, - platform::errors::InvalidArgument( - "ksize must be 2D, i.e. 2D pooling")); - PADDLE_ENFORCE_EQ(pooling_type == "max" || pooling_type == "avg", true, - platform::errors::InvalidArgument( - "pooling_type must be 'max' or 'avg'")); - PADDLE_ENFORCE_EQ(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input dim must be with 4, i.e. NCHW")); + PADDLE_ENFORCE_EQ( + ksize.size(), 2, + platform::errors::InvalidArgument( + "The ksize must be 2D, i.e. 2D pooling, but received %dD.", + ksize.size())); + PADDLE_ENFORCE_EQ( + pooling_type == "max" || pooling_type == "avg", true, + platform::errors::InvalidArgument( + "The pooling_type must be 'max' or 'avg', but received %s.", + pooling_type)); + PADDLE_ENFORCE_EQ( + input->dims().size(), 4, + platform::errors::InvalidArgument( + "Input dim must be with 4, i.e. 
NCHW, but received %d.", + input->dims().size())); const auto input_dims = input->dims(); framework::DDim data_dims = @@ -1421,7 +1427,7 @@ static std::shared_ptr SetDstMemory( residual_param_data, platform::errors::PreconditionNotMet("Residual parameter is required for " "the DNNL conv+elementwise_add " - "fusion, but now it is missing")); + "fusion, but now it is missing.")); std::shared_ptr user_residual_memory_p = handler->AcquireResidualDataMemory(user_residual_md, to_void_cast(residual_param_data)); diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 8ae88746fc992..22550de5b3fad 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -96,8 +96,9 @@ struct NCCLContextMap { explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, size_t num_trainers = 1, size_t trainer_id = 0) { - PADDLE_ENFORCE_EQ(!places.empty(), true, platform::errors::InvalidArgument( - "The NCCL place is empty.")); + PADDLE_ENFORCE_EQ(!places.empty(), true, + platform::errors::InvalidArgument( + "The NCCL place should not be empty.")); order_.reserve(places.size()); for (auto &p : places) { int dev_id = BOOST_GET_CONST(CUDAPlace, p).device; @@ -276,8 +277,9 @@ class NCCLCommunicator { PADDLE_ENFORCE_GT( inter_trainers_num, 1, - platform::errors::InvalidArgument("inter_trainers_num:%llu must > 1", - inter_trainers_num)); + platform::errors::InvalidArgument( + "The inter_trainers_num:%llu should be larger than 1.", + inter_trainers_num)); int inter_trainer_id = trainer_id % inter_trainers_num; for (size_t i = 0; i < inter_nccl_ids.size(); i++) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index fc1d9a8799962..85759bc6e2ea3 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -94,10 +94,9 @@ void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, if (g_state == ProfilerState::kDisabled) return; 
std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE_EQ( - events.count(ptr), 0, - platform::errors::InvalidArgument( - "The Place can't exist in the stage of PushMemRecord")); + PADDLE_ENFORCE_EQ(events.count(ptr), 0, + platform::errors::InvalidArgument( + "The Place can't exist in the stage of PushMemRecord")); events.emplace(ptr, std::unique_ptr( new MemEvenRecorder::RecordMemEvent(place, size))); } diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 36c577fa0503b..c79195aa0db0d 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -570,7 +570,7 @@ void PrintProfiler( } else { PADDLE_THROW(platform::errors::InvalidArgument( "Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but " - "received Invalid profiler state")); + "received Invalid profiler state.")); } if (merge_thread) { diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h index d988d12a759bd..3603c0f24f279 100644 --- a/paddle/fluid/platform/resource_pool.h +++ b/paddle/fluid/platform/resource_pool.h @@ -60,7 +60,7 @@ class ResourcePool : public std::enable_shared_from_this> { obj = creator_(); PADDLE_ENFORCE_NOT_NULL(obj, platform::errors::PermissionDenied( - "The creator should not return nullptr")); + "The creator should not return nullptr.")); VLOG(10) << "Create new instance " << TypePtrName(); } else { obj = instances_.back(); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 626f6b1ecc217..82941c5828056 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -721,11 +721,11 @@ void BindImperative(py::module *m_ptr) { .def("_run_backward", [](imperative::VarBase &self, const imperative::detail::BackwardStrategy &bckst, - const imperative::Tracer &tracer) { + const imperative::Tracer &tracer, bool retain_graph) { // TODO(jiabin): when we impl more 
backward execution we can // select them auto *engine = tracer.GetEngine(); - engine->Init(&self, bckst); + engine->Init(&self, bckst, retain_graph); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 5a0b18a34f768..ed6e18699d4cb 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -433,6 +433,7 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, + py::arg("zero_copy") = false, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("passes_filter") = std::vector(), py::arg("ops_filter") = std::vector()) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat new file mode 100644 index 0000000000000..0c96906afb917 --- /dev/null +++ b/paddle/scripts/paddle_build.bat @@ -0,0 +1,239 @@ +@ECHO OFF +SETLOCAL + +set work_dir=%cd% +if not defined BRANCH set BRANCH=develop +if not defined PYTHON_ROOT set PYTHON_ROOT=c:\Python27 +if not defined WITH_MKL set WITH_MKL=ON +if not defined WITH_AVX set WITH_AVX=ON +if not defined WITH_AVX set WITH_AVX=ON +if not defined WITH_GPU set WITH_GPU=OFF +if not defined WITH_TESTING set WITH_TESTING=ON +if not defined WITH_PYTHON set WITH_PYTHON=ON +if not defined ON_INFER set ON_INFER=ON +if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF +if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=d:/.cache/inference_demo +if not defined THIRD_PARTY_PATH set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party +set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe +dir d:\.cache + +goto :CASE_%1 + +echo "Usage: paddle_build.bat [OPTION]" +echo "OPTION:" +echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows" +echo 
"wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows" +exit /b 1 + +:CASE_wincheck_mkl +call :cmake || goto cmake_error +call :build || goto build_error +call :test_whl_pacakage || goto test_whl_pacakage_error +call :unit_test || goto unit_test_error +call :test_inference || goto test_inference_error +call :check_change_of_unittest || goto check_change_of_unittest_error +goto:success + +:CASE_wincheck_openblas +call :cmake || goto cmake_error +call :build || goto build_error +call :test_whl_pacakage || goto test_whl_pacakage_error +goto:success + +rem --------------------------------------------------------------------------------------------- +:cmake +echo ======================================== +echo Step 1. Cmake ... +echo ======================================== + +mkdir build +cd /d build +cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% +goto:eof + +:cmake_error +exit /b %ERRORLEVEL% + +rem --------------------------------------------------------------------------------------------- +:build +echo ======================================== +echo Step 2. Buile Paddle ... 
+echo ======================================== +call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 +set build_times=1 + +:build_tp +echo BUILD THIRD_PARTY %build_times% +msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj +echo BUILD THIRD_PARTY RESULT %ERRORLEVEL% +if %ERRORLEVEL% NEQ 0 ( + set /a build_times=%build_times%+1 + if %build_times% GTR 3 ( + exit /b 1 + ) else ( + goto :build_tp + ) +) + +set build_times=1 +:build_paddle +echo BUILD PADDLE %build_times% +msbuild /m /p:Configuration=Release /verbosity:quiet paddle.sln +echo BUILD PADDLE RESULT %ERRORLEVEL% +if %ERRORLEVEL% NEQ 0 ( + set /a build_times=%build_times%+1 + if %build_times% GTR 2 ( + exit /b 1 + ) else ( + goto :build_paddle + ) +) +goto:eof + +:build_error +exit /b %ERRORLEVEL% + +rem --------------------------------------------------------------------------------------------- +:test_whl_pacakage +echo ======================================== +echo Step 3. Test pip install whl package ... +echo ======================================== +dir /s /b python\dist\*.whl > whl_file.txt +set /p PADDLE_WHL_FILE_WIN=< whl_file.txt +%PYTHON_EXECUTABLE% -m pip install -U %PADDLE_WHL_FILE_WIN% +echo import paddle.fluid;print(paddle.__version__) > test_whl.py +%PYTHON_EXECUTABLE% test_whl.py +goto:eof + +:test_whl_pacakage_error +exit /b %ERRORLEVEL% + +rem --------------------------------------------------------------------------------------------- +:unit_test +echo ======================================== +echo Step 4. Running unit tests ... 
+echo ======================================== +%PYTHON_EXECUTABLE% -m pip install --upgrade pip +dir %work_dir%\build\third_party\install\openblas\lib +dir %work_dir%\build\third_party\install\openblas\bin +dir %work_dir%\build\third_party\install\zlib\bin +dir %work_dir%\build\third_party\install\mklml\lib +dir %work_dir%\build\third_party\install\mkldnn\bin +dir %work_dir%\build\third_party\install\warpctc\bin + +set PATH=%work_dir%\build\third_party\install\openblas\lib;%work_dir%\build\third_party\install\openblas\bin;%work_dir%\build\third_party\install\zlib\bin;%work_dir%\build\third_party\install\mklml\lib;%work_dir%\build\third_party\install\mkldnn\bin;%work_dir%\build\third_party\install\warpctc\bin;%PATH% +ctest.exe --output-on-failure -C Release -j 10 +goto:eof + +:unit_test_error +exit /b %ERRORLEVEL% + +rem --------------------------------------------------------------------------------------------- +:test_inference +echo ======================================== +echo Step 5. Testing fluid library for inference ... +echo ======================================== +if NOT EXIST "d:\.cache\tools" ( + git clone https://github.com/zhouwei25/tools.git d:\.cache\tools +) +cd %work_dir%\paddle\fluid\inference\api\demo_ci + +d:\.cache\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% d:/.cache/inference_demo +goto:eof + +:test_inference_error +exit /b %ERRORLEVEL% + +rem --------------------------------------------------------------------------------------------- +:check_change_of_unittest +echo ======================================== +echo Step 6. Check whether deleting a unit test ... 
+echo ======================================== + +set PATH=%PYTHON_ROOT%;%PATH% +cd /d %work_dir%\build +echo set -ex> check_change_of_unittest.sh +echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh +echo GIT_PR_ID=%AGILE_PULL_ID% >> check_change_of_unittest.sh +echo BRANCH=%BRANCH%>> check_change_of_unittest.sh +echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh +echo exit 0 >> check_change_of_unittest.sh +echo fi>> check_change_of_unittest.sh +echo cat ^<^> check_change_of_unittest.sh +echo ============================================ >> check_change_of_unittest.sh +echo Generate unit tests.spec of this PR. >> check_change_of_unittest.sh +echo ============================================ >> check_change_of_unittest.sh +echo EOF>> check_change_of_unittest.sh +echo spec_path=$(pwd)/../paddle/fluid/UNITTEST_PR.spec>> check_change_of_unittest.sh +echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh +echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh +echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh +echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh +echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh +echo elif [ "$origin_upstream_url" != "$UPSTREAM_URL" ] \>> check_change_of_unittest.sh +echo ^&^& [ "$origin_upstream_url" != "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh +echo git remote remove upstream>> check_change_of_unittest.sh +echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh +echo fi>> check_change_of_unittest.sh +echo if [ ! 
-e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>> check_change_of_unittest.sh +echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_unittest.sh +echo fi>> check_change_of_unittest.sh +echo git checkout -b origin_pr >> check_change_of_unittest.sh +echo git checkout -b test_pr -t upstream/$BRANCH >> check_change_of_unittest.sh +echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE:\=\\% -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -DON_INFER=%ON_INFER% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% >> check_change_of_unittest.sh +echo cat ^<^> check_change_of_unittest.sh +echo ============================================ >> check_change_of_unittest.sh +echo Generate unit tests.spec of develop. >> check_change_of_unittest.sh +echo ============================================ >> check_change_of_unittest.sh +echo EOF>> check_change_of_unittest.sh +echo spec_path=$(pwd)/../paddle/fluid/UNITTEST_DEV.spec>> check_change_of_unittest.sh +echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh +echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/../paddle/fluid/UNITTEST_DEV.spec $(pwd)/../paddle/fluid/UNITTEST_PR.spec`>> check_change_of_unittest.sh +echo if [ "$unittest_spec_diff" != "" ]; then>> check_change_of_unittest.sh +echo # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420>> check_change_of_unittest.sh +echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh +echo set +x>> check_change_of_unittest.sh +echo if [ "$approval_line" != "" ]; then>> 
check_change_of_unittest.sh +echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh +echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh +echo if [ "${APPROVALS}" == "FALSE" ]; then>> check_change_of_unittest.sh +echo echo "************************************" >> check_change_of_unittest.sh +echo echo -e "It is forbidden to disable or delete the unit-test.\n" >> check_change_of_unittest.sh +echo echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]." >> check_change_of_unittest.sh +echo echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n" >> check_change_of_unittest.sh +echo echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n" >> check_change_of_unittest.sh +echo echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" >> check_change_of_unittest.sh +echo echo "************************************" >> check_change_of_unittest.sh +echo exit 1 >> check_change_of_unittest.sh +echo fi>> check_change_of_unittest.sh +echo else>> check_change_of_unittest.sh +echo exit 1 >> check_change_of_unittest.sh +echo fi>> check_change_of_unittest.sh +echo fi>> check_change_of_unittest.sh +echo git checkout origin_pr >> check_change_of_unittest.sh +d:\.cache\tools\busybox64.exe bash check_change_of_unittest.sh +goto:eof + +:check_change_of_unittest_error +exit /b %ERRORLEVEL% + + +rem --------------------------------------------------------------------------------------------- +:success +echo ======================================== +echo Clean up environment at the end ... 
+echo ======================================== +taskkill /f /im cmake.exe 2>NUL +taskkill /f /im msbuild.exe 2>NUL +taskkill /f /im git.exe 2>NUL +taskkill /f /im cl.exe 2>NUL +taskkill /f /im lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im git-remote-https.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +echo Windows CI run successfully! +exit /b 0 + +ENDLOCAL diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 72a792002b31e..0b6b006bbb244 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -64,6 +64,9 @@ function cmake_base() { # Delete previous built whl packages rm -rf python/dist 2>/dev/null || true + # `gym` is only used in unittest, it's not suitable to add in requirements.txt. + # Add it dynamically. + echo "gym" >> ${PADDLE_ROOT}/python/requirements.txt # Support build for all python versions, currently # including cp27-cp27m and cp27-cp27mu. PYTHON_FLAGS="" @@ -119,6 +122,8 @@ function cmake_base() { exit 1 fi fi + # delete `gym` to avoid modifying requirements.txt in *.whl + sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt else if [ "$1" != "" ]; then echo "using python abi: $1" @@ -175,6 +180,8 @@ function cmake_base() { else pip install -r ${PADDLE_ROOT}/python/requirements.txt fi + # delete `gym` to avoid modifying requirements.txt in *.whl + sed -i "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt fi if [ "$SYSTEM" == "Darwin" ]; then @@ -213,6 +220,7 @@ function cmake_base() { -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} -DWITH_GRPC=${grpc_flag} -DWITH_LITE=${WITH_LITE:-OFF} + -DLITE_GIT_TAG=develop ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -241,6 +249,7 @@ EOF -DPY_VERSION=${PY_VERSION:-2.7} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \ -DWITH_GRPC=${grpc_flag} \ + -DLITE_GIT_TAG=develop \ 
-DWITH_LITE=${WITH_LITE:-OFF};build_error=$? if [ "$build_error" != 0 ];then exit 7; @@ -370,6 +379,7 @@ function cmake_gen_and_build() { } function build_mac() { + set +e mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build cat < $op_desc_path + # print api and the md5 of source code of the api. + api_source_md5_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.source.md5 + python ${PADDLE_ROOT}/tools/count_api_without_core_ops.py -p paddle > $api_source_md5_path + awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then @@ -1269,10 +1287,10 @@ function example() { pip install ${PADDLE_ROOT}/build/python/dist/*.whl paddle version cd ${PADDLE_ROOT}/tools - python sampcd_processor.py cpu - if [ "$?" != "0" ];then + python sampcd_processor.py cpu;example_error=$? + if [ "$example_error" != "0" ];then echo "Code instance execution failed" - exit 1 + exit 5 fi } diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 2eed69c9df6be..0d572599a6678 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -132,6 +132,7 @@ from .tensor.math import atan #DEFINE_ALIAS from .tensor.math import ceil #DEFINE_ALIAS from .tensor.math import cos #DEFINE_ALIAS +from .tensor.math import cosh #DEFINE_ALIAS from .tensor.math import cumsum #DEFINE_ALIAS from .tensor.math import elementwise_add #DEFINE_ALIAS from .tensor.math import elementwise_div #DEFINE_ALIAS @@ -139,14 +140,12 @@ from .tensor.math import elementwise_max #DEFINE_ALIAS from .tensor.math import elementwise_min #DEFINE_ALIAS from .tensor.math import elementwise_mod #DEFINE_ALIAS -from .tensor.math import elementwise_mul #DEFINE_ALIAS from .tensor.math import elementwise_pow #DEFINE_ALIAS from .tensor.math import elementwise_sub #DEFINE_ALIAS from .tensor.math import exp #DEFINE_ALIAS from .tensor.math import floor #DEFINE_ALIAS from 
.tensor.math import increment #DEFINE_ALIAS from .tensor.math import log #DEFINE_ALIAS -from .tensor.math import mul #DEFINE_ALIAS from .tensor.math import multiplex #DEFINE_ALIAS from .tensor.math import pow #DEFINE_ALIAS from .tensor.math import reciprocal #DEFINE_ALIAS @@ -159,6 +158,7 @@ from .tensor.math import scale #DEFINE_ALIAS from .tensor.math import sign #DEFINE_ALIAS from .tensor.math import sin #DEFINE_ALIAS +from .tensor.math import sinh #DEFINE_ALIAS from .tensor.math import sqrt #DEFINE_ALIAS from .tensor.math import square #DEFINE_ALIAS from .tensor.math import stanh #DEFINE_ALIAS @@ -170,6 +170,7 @@ from .tensor.math import min #DEFINE_ALIAS from .tensor.math import mm #DEFINE_ALIAS from .tensor.math import div #DEFINE_ALIAS +from .tensor.math import multiply #DEFINE_ALIAS from .tensor.math import add #DEFINE_ALIAS from .tensor.math import atan #DEFINE_ALIAS from .tensor.math import logsumexp #DEFINE_ALIAS diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 0bfd75b499440..7c8fa257f778e 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -381,7 +381,7 @@ def start_local_trainers(cluster, tp.rank = t.rank tp.local_rank = idx tp.log_fn = fn - tp.log_offset = 0 if fn else None + tp.log_offset = fn.tell() if fn else None tp.cmd = cmd procs.append(tp) diff --git a/python/paddle/fleet/__init__.py b/python/paddle/fleet/__init__.py index a5a8d12ed4400..b25c362ce9301 100644 --- a/python/paddle/fleet/__init__.py +++ b/python/paddle/fleet/__init__.py @@ -14,10 +14,29 @@ # TODO: define distributed api under this directory, from .base.distributed_strategy import DistributedStrategy -#from .base.role_maker import PaddleCloudRoleMaker, UserDefinedRoleMaker -#from .base.fleet_base import Fleet +from .base.fleet_base import Fleet +from .base.util_factory import UtilBase -#__all__ = [ -# "DistributedStrategy", "PaddleCloudRoleMaker", "UserDefinedRoleMaker" -#] -__all__ = 
['DistributedStrategy'] +#from .base.role_maker import PaddleCloudRoleMaker + +__all__ = ["DistributedStrategy", "UtilBase"] + +fleet = Fleet() +init = fleet.init +is_first_worker = fleet.is_first_worker +worker_index = fleet.worker_index +worker_num = fleet.worker_num +is_worker = fleet.is_worker +worker_endpoints = fleet.worker_endpoints +server_num = fleet.server_num +server_index = fleet.server_index +server_endpoints = fleet.server_endpoints +is_server = fleet.is_server +util = fleet.util +barrier_worker = fleet.barrier_worker +init_worker = fleet.init_worker +init_server = fleet.init_server +run_server = fleet.run_server +stop_worker = fleet.stop_worker +distributed_optimizer = fleet.distributed_optimizer +minimize = fleet.minimize diff --git a/python/paddle/fleet/base/distributed_strategy.py b/python/paddle/fleet/base/distributed_strategy.py index 0ebaff3a0f70c..fdc5b22ae4c62 100644 --- a/python/paddle/fleet/base/distributed_strategy.py +++ b/python/paddle/fleet/base/distributed_strategy.py @@ -14,6 +14,7 @@ from paddle.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable +import google.protobuf.text_format class DistributedJobInfo(object): @@ -57,6 +58,15 @@ class DistributedStrategy(object): def __init__(self): self.strategy = distributed_strategy_pb2.DistributedStrategy() + def save_to_prototxt(self, output): + with open(output, "w") as fout: + fout.write(str(self.strategy)) + + def load_from_prototxt(self, pb_file): + f = open(pb_file, 'r') + self.strategy = google.protobuf.text_format.Merge( + str(f.read()), self.strategy) + @property def amp(self): return self.strategy.amp @@ -189,6 +199,19 @@ def hierachical_allreduce(self, flag): print( "WARNING: hierachical_allreduce should have value of bool type") + @property + def hierachical_allreduce_inter_ranks(self): + return self.strategy.hierachical_allreduce_inter_ranks + + @hierachical_allreduce_inter_ranks.setter + def hierachical_allreduce_inter_ranks(self, flag): + if 
isinstance(flag, bool): + self.strategy.hierachical_allreduce_inter_ranks = flag + else: + print( + "WARNING: hierachical_allreduce_inter_ranks should have value of bool type" + ) + @property def nccl_comm_num(self): return self.strategy.nccl_comm_num @@ -235,6 +258,17 @@ def sequential_execution(self, flag): print( "WARNING: sequential_execution should have value of bool type") + @property + def sync_nccl_allreduce(self): + return self.strategy.sync_nccl_allreduce + + @sync_nccl_allreduce.setter + def sync_nccl_allreduce(self, flag): + if isinstance(flag, bool): + self.strategy.sync_nccl_allreduce = flag + else: + print("WARNING: sync_nccl_allreduce should have avlue of bool type") + @property def lars(self): return self.strategy.lars @@ -305,6 +339,17 @@ def fuse_relu_depthwise_conv(self, flag): "WARNING: fuse_relu_depthwise_conv should have value of bool type" ) + @property + def fuse_broadcast_ops(self): + return self.strategy.fuse_broadcast_ops + + @fuse_broadcast_ops.setter + def fuse_broadcast_ops(self, flag): + if isinstance(flag, bool): + self.strategy.fuse_broadcast_ops = flag + else: + print("WARNING: fuse_broadcast_ops should have value of bool type") + @property def enable_inplace(self): return self.strategy.enable_inplace @@ -340,6 +385,18 @@ def num_iteration_per_drop_scope(self, flag): "WARNING: num_iteration_per_drop_scope should have value of int type" ) + @property + def num_iteration_per_run(self): + return self.strategy.num_iteration_per_run + + @num_iteration_per_run.setter + def num_iteration_per_run(self, value): + if isinstance(value, int): + self.strategy.num_iteration_per_run = value + else: + print( + "WARNING: num_iteration_per_run should have value of int type") + @property def sync_batch_norm(self): return self.strategy.sync_batch_norm @@ -499,6 +556,17 @@ def elastic(self, flag): else: print("WARNING: elastic should have value of bool type") + @property + def num_threads(self): + return self.strategy.num_threads + + 
@num_threads.setter + def num_threads(self, value): + if isinstance(value, int): + self.strategy.num_threads = value + else: + print("WARNING: num_threads should have value of int type") + @property def auto(self): return self.strategy.auto diff --git a/python/paddle/fleet/base/fleet_base.py b/python/paddle/fleet/base/fleet_base.py index 881044006479e..13b9fc3220a09 100644 --- a/python/paddle/fleet/base/fleet_base.py +++ b/python/paddle/fleet/base/fleet_base.py @@ -13,7 +13,331 @@ # limitations under the License. from __future__ import print_function -from paddle.fleet import RoleMakerBase -from . import obj_creator +import paddle +from .strategy_compiler import StrategyCompiler +from .meta_optimizer_factory import MetaOptimizerFactory +from .runtime_factory import RuntimeFactory +from .util_factory import UtilFactory -# __all__ = ['Fleet'] +__all__ = ['Fleet'] + + +class Fleet(object): + """ + Unified API for distributed training of PaddlePaddle + Please reference the https://github.com/PaddlePaddle/Fleet for details + + + Returns: + Fleet: A Fleet instance + + Examples: + .. 
code-block:: python + + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + if fleet.is_first_worker(): + print("this is first worker") + print("current node index: {}".format(fleet.worker_index())) + print("total number of worker num: {}".format(fleet.worker_num())) + if fleet.is_worker(): + print("this is worker") + print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) + print("server num: {}".format(fleet.server_num())) + print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) + if fleet.is_server(): + print("this is server") + fleet.stop_worker() + """ + + def __init__(self): + self._runtime_handle = None + self._util = None + + def init(self, role_maker): + self._role_maker = role_maker + self.strategy_compiler = StrategyCompiler() + + def is_first_worker(self): + """ + Check whether the node is the first instance of worker. + + Returns: + bool: True if this is the first node of worker, + False if not. + + """ + return self._role_maker.is_first_worker() + + def worker_index(self): + """ + Get current worker index. + + Returns: + int: node id + """ + return self._role_maker.worker_index() + + def worker_num(self): + """ + Get current total worker number. + + Returns: + int: worker numbers + """ + return self._role_maker.worker_num() + + def is_worker(self): + """ + Check whether the node is an instance of worker. + + Returns: + bool: True if this is a node of worker, + False if not. + """ + return self._role_maker.is_worker() + + def worker_endpoints(self, to_string=False): + """ + Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"]. 
+ + Returns: + list/string: server endpoints + """ + ''' + if to_string: + return ",".join(self._role_maker.get_trainer_endpoints()) + else: + return self._role_maker.get_trainer_endpoints() + ''' + return ["127.0.0.1:1001", "127.0.0.1:1002"] + + def server_num(self): + """ + Get current total worker number. + + Returns: + int: server number + """ + return len(self._role_maker.get_pserver_endpoints()) + + def server_index(self): + """ + Get current server index. + + Returns: + int: node id + """ + return self._role_maker.server_index() + + def server_endpoints(self, to_string=False): + """ + Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"]. + + Returns: + list/string: server endpoints + """ + ''' + if to_string: + return ",".join(self._role_maker.get_pserver_endpoints()) + else: + return self._role_maker.get_pserver_endpoints() + ''' + return ["127.0.0.1:1001", "127.0.0.1:1002"] + + def is_server(self): + """ + Check whether the node is an instance of server. + + Returns: + bool: True if this is a node of server, + False if not. 
+ """ + return self._role_maker.is_server() + + @property + def util(self): + """ + Utility functions that can be used under certain runtime + return util + """ + return self._util + + @util.setter + def util(self, util): + """ + Set Utility functions for userd-defined runtime + set util + """ + self._util = util + + def barrier_worker(self): + """ + barrier between workers + """ + self._role_maker.barrier_worker() + + def init_worker(self): + """ + init worker + """ + assert self._runtime_handle is not None + self._runtime_handle._init_worker() + + def init_server(self, model_dir=None): + """ + init server + """ + assert self._runtime_handle is not None + self._runtime_handle._init_server() + + def run_server(self): + """ + run server + """ + assert self._runtime_handle is not None + self._runtime_handle._run_server() + + def stop_worker(self): + """ + stop worker + """ + assert self._runtime_handle is not None + self._runtime_handle._stop_worker() + + def distributed_optimizer(self, optimizer, strategy): + """ + distirbuted_optimizer + Returns: + Fleet instance with minimize interface like optimizers + + Examples: + .. code-block:: python + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + """ + self.user_defined_optimizer = optimizer + self.user_defined_strategy = strategy + return self + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + """ + Add distributed operations to minimize ``loss`` by updating ``parameter_list``. + + Args: + loss (Variable): A ``Variable`` containing the value to minimize. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameter_list``. 
The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need + to be updated. The default value is None. + + Returns: + tuple: tuple (optimize_ops, params_grads), A list of operators appended + by minimize and a list of (param, grad) variable pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and + ``fetch_list`` before run, see details in ``Executor``. + + Examples: + import paddle + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + + fc_1 = paddle.layers.fc(input=input_x, size=hid_dim, act='tanh') + fc_2 = paddlen.layers.fc(input=fc_1, size=hid_dim, act='tanh') + prediction = paddle.layers.fc(input=[fc_2], size=label_dim, act='softmax') + cost = paddle.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.layers.mean(x=cost) + + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + # for more examples, please reference https://github.com/PaddlePaddle/Fleet + + """ + # cache original feed forward program + self.origin_main_program = loss.block.program + if startup_program == None: + self.origin_startup_program = \ + paddle.default_startup_program().clone(for_test=False) + startup_program = paddle.default_startup_program() + else: + self.origin_startup_program = \ + 
startup_program.clone(for_test=False) + + # compile time + distributed_optimizer_list = \ + MetaOptimizerFactory()._get_valid_meta_optimizers( + self.user_defined_optimizer) + valid_optimizer_list = [] + valid_graph_optimizer_list = [] + # recall meta optimizers for ranking + for opt in distributed_optimizer_list: + opt._set_basic_info(loss, self._role_maker, + self.user_defined_optimizer, + self.user_defined_strategy) + if opt._can_apply() and not opt._is_graph_out(): + valid_optimizer_list.append(opt) + if opt._can_apply() and opt._is_graph_out(): + valid_graph_optimizer_list.append(opt) + # combine recalled meta optimizers to be a valid meta optimizer + meta_optimizer, graph_optimizer, final_dist_strategy = \ + self.strategy_compiler.generate_optimizer( + loss, self._role_maker, self.user_defined_optimizer, + self.user_defined_strategy, valid_optimizer_list, + valid_graph_optimizer_list) + + optimize_ops = [] + params_grads = [] + if meta_optimizer: + optimize_ops, params_grads = meta_optimizer.minimize( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + + if graph_optimizer: + optimizer_ops, params_grads = graph_optimizer.minimize( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + # since we do not encourage users to use graph operations + # if a graph optimizer takes effect, mostly + # optimizers_ops and params_grads are None + # i.e. 
users can not modify current computation graph anymore + + if self._runtime_handle is None: + self._runtime_handle = RuntimeFactory()._create_runtime( + final_dist_strategy, self._role_maker, optimize_ops, + params_grads) + + if self._util is None: + self._util = UtilFactory()._create_util(final_dist_strategy, + self._role_maker, + optimize_ops, params_grads) + + return optimize_ops, params_grads diff --git a/python/paddle/fleet/base/meta_optimizer_factory.py b/python/paddle/fleet/base/meta_optimizer_factory.py new file mode 100644 index 0000000000000..8d42c2a0c89ef --- /dev/null +++ b/python/paddle/fleet/base/meta_optimizer_factory.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ..meta_optimizers import RecomputeOptimizer +from ..meta_optimizers import GraphExecutionOptimizer + +__all__ = ["MetaOptimizerFactory"] + +meta_optimizer_names = ["RecomputeOptimizer", "GraphExecutionOptimizer"] + + +class MetaOptimizerFactory(object): + def __init__(self): + pass + + def _get_valid_meta_optimizers(self, user_defined_optimizer): + opt_list = [] + for opt_name in meta_optimizer_names: + opt_list.append(globals()[opt_name](user_defined_optimizer)) + return opt_list diff --git a/python/paddle/fleet/base/private_helper_function.py b/python/paddle/fleet/base/private_helper_function.py new file mode 100644 index 0000000000000..6b3232b93b224 --- /dev/null +++ b/python/paddle/fleet/base/private_helper_function.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import time +import socket +from contextlib import closing +from six import string_types + + +def wait_server_ready(endpoints): + """ + Wait until parameter servers are ready, use connext_ex to detect + port readiness. + + Args: + endpoints (list): endpoints string list, like: + ["127.0.0.1:8080", "127.0.0.1:8081"] + + Examples: + .. 
code-block:: python + + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + """ + assert not isinstance(endpoints, str) + while True: + all_ok = True + not_ready_endpoints = [] + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + not_ready_endpoints.append(ep) + if not all_ok: + sys.stderr.write("server not ready, wait 3 sec to retry...\n") + sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) + + "\n") + sys.stderr.flush() + time.sleep(3) + else: + break diff --git a/python/paddle/fleet/base/runtime_factory.py b/python/paddle/fleet/base/runtime_factory.py new file mode 100644 index 0000000000000..c4d42db4ea993 --- /dev/null +++ b/python/paddle/fleet/base/runtime_factory.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from ..runtime.collective_runtime import CollectiveRuntime + + +class RuntimeFactory(object): + def __init__(self): + pass + + def _create_runtime(self, final_dist_strategy, role_maker, opt_ops, + params_grads): + if role_maker._is_collective: + collective_runtime = CollectiveRuntime() + collective_runtime._set_basic_info(final_dist_strategy, role_maker, + opt_ops, params_grads) + return collective_runtime diff --git a/python/paddle/fleet/base/strategy_compiler.py b/python/paddle/fleet/base/strategy_compiler.py new file mode 100644 index 0000000000000..92b50781f65ba --- /dev/null +++ b/python/paddle/fleet/base/strategy_compiler.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+def maximum_path_len_algo(optimizer_list):
+    max_idx = 0
+    max_len = 0
+    candidates = []
+    for idx, opt in enumerate(optimizer_list):
+        local_buffer = [opt]
+        for opt_inner in optimizer_list:
+            if opt._can_update(opt_inner):
+                local_buffer.append(opt_inner)
+        if len(local_buffer) > max_len:
+            max_idx = idx
+            max_len = len(local_buffer)
+        candidates.append(local_buffer)
+    if len(candidates) == 0:
+        return None
+    for idx, opt in enumerate(candidates[max_idx][:-1]):
+        opt._update_inner_optimizer(candidates[max_idx][idx + 1])
+    return candidates[max_idx][0]
+
+
+class StrategyCompilerBase(object):
+    def __init__(self):
+        pass
+
+
+class StrategyCompiler(StrategyCompilerBase):
+    """
+    StrategyCompiler is responsible for meta optimizers combination
+    Generally, a user can define several distributed strategies that
+    can generate several meta optimizers. The combination of these
+    meta optimizers should have the right order to apply the optimizers'
+    minimize function.
+    This class is responsible for the executable distributed optimizer
+    generation.
+    """
+
+    def __init__(self):
+        super(StrategyCompiler, self).__init__()
+
+    def generate_optimizer(self, loss, role_maker, optimizer,
+                           user_defined_strategy, meta_optimizer_list,
+                           graph_optimizer_list):
+        if len(meta_optimizer_list) == 0 and len(graph_optimizer_list) == 0:
+            return optimizer, None, None
+        else:
+            # currently, we use heuristic algorithm to select
+            # meta optimizers combinations
+            meta_optimizer = maximum_path_len_algo(meta_optimizer_list)
+            graph_optimizer = maximum_path_len_algo(graph_optimizer_list)
+            # should design a distributed strategy update interface
+            # when we have finally decided the combination of meta_optimizer
+            # and graph_optimizer, the corresponding distributed strategy
+            # should be updated.
+ return meta_optimizer, graph_optimizer, None diff --git a/python/paddle/fleet/base/util_base.py b/python/paddle/fleet/base/util_factory.py similarity index 71% rename from python/paddle/fleet/base/util_base.py rename to python/paddle/fleet/base/util_factory.py index 7654d0bcd9cd6..74029f43d10c8 100644 --- a/python/paddle/fleet/base/util_base.py +++ b/python/paddle/fleet/base/util_factory.py @@ -16,13 +16,30 @@ """basic collective operations in python""" """remote file system""" -# __all__ = ['UtilBase'] -''' +__all__ = ['UtilBase'] + + +class UtilFactory(object): + def _create_util(self, dist_strategy, role_maker, optimize_ops, + params_grads): + util = UtilBase() + util._set_strategy(dist_strategy) + util._set_role_maker(role_maker) + return util + + class UtilBase(object): - def __init__(self, role_maker, fleet_obj): - self.role_maker = roke_maker - self.fleet_obj = fleet_obj + def __init__(self): + self.role_maker = None + self.dist_strategy = None + + def _set_strategy(self, dist_strategy): + self.dist_strategy = dist_strategy + + def _set_role_maker(self, role_maker): + self.role_maker = role_maker + ''' def set_file_system(self, fs_client): self.fs_client = fs_client @@ -61,4 +78,4 @@ def save_var(): def print_on_rank(self): pass -''' + ''' diff --git a/python/paddle/fleet/collective/__init__.py b/python/paddle/fleet/meta_optimizers/__init__.py similarity index 79% rename from python/paddle/fleet/collective/__init__.py rename to python/paddle/fleet/meta_optimizers/__init__.py index 8647330f3290f..8a87a31e90389 100644 --- a/python/paddle/fleet/collective/__init__.py +++ b/python/paddle/fleet/meta_optimizers/__init__.py @@ -10,3 +10,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and + +from .recompute_optimizer import RecomputeOptimizer +from .graph_execution_optimizer import GraphExecutionOptimizer + +__all__ = ['RecomputeOptimizer'] diff --git a/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py new file mode 100644 index 0000000000000..cc3d1cd2128bd --- /dev/null +++ b/python/paddle/fleet/meta_optimizers/graph_execution_optimizer.py @@ -0,0 +1,194 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and + +import paddle +from paddle.fluid.framework import core +from paddle.fluid import compiler +from .meta_optimizer_base import MetaOptimizerBase +from ..base.private_helper_function import wait_server_ready + + +def get_build_strategy(dist_strategy): + build_strategy = paddle.BuildStrategy() + build_strategy.enable_sequential_execution = \ + dist_strategy.sequential_execution + build_strategy.remove_unnecessary_lock = True + build_strategy.fuse_elewise_add_act_ops = \ + dist_strategy.fuse_elewise_add_act_ops + build_strategy.fuse_bn_act_ops = \ + dist_strategy.fuse_bn_act_ops + build_strategy.enable_auto_fusion = \ + dist_strategy.enable_auto_fusion + build_strategy.fuse_relu_depthwise_conv = \ + dist_strategy.fuse_relu_depthwise_conv + build_strategy.fuse_broadcast_ops = \ + dist_strategy.fuse_broadcast_ops + build_strategy.sync_batch_norm = \ + dist_strategy.sync_batch_norm + return build_strategy + + +def get_execution_strategy(dist_strategy): + execution_strategy = paddle.ExecutionStrategy() + execution_strategy.num_threads = \ + dist_strategy.num_threads + execution_strategy.num_iteration_per_drop_scope = \ + dist_strategy.num_iteration_per_drop_scope + execution_strategy.num_iteration_per_run = \ + dist_strategy.num_iteration_per_run + execution_strategy.use_thread_barrier = \ + dist_strategy.use_thread_barrier + return execution_strategy + + +class GraphExecutionOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(GraphExecutionOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + # we do not allow meta optimizer to be inner optimizer currently + self.meta_optimizers_white_list = [] + + def _is_graph_out(self): + return True + + def _can_apply(self): + """ + Basically, this is PE, and almost all programs can be executed here + """ + return True + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + 
callbacks=None): + pass + + # should fix the variable + def _setup_nccl_op(self, startup_program, main_program): + trainer_endpoints = self.role_maker.get_trainer_endpoints() + trainer_id = self.role_maker.worker_index() + current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id] + trainer_endpoints_env = ",".join(trainer_endpoints) + trainers_num = self.role_maker.worker_num() + if trainer_id == 0: + other_trainer_endpoints = trainer_endpoints[:] + other_trainer_endpoints.remove(current_endpoint) + wait_server_ready(other_trainer_endpoints) + nccl_id_var = startup_program.global_block().create_var( + name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) + for i in range(1, self.user_defined_strategy.nccl_comm_num): + startup_program.global_block().create_var( + name="NCCLID_{}".format(i), + persistable=True, + type=core.VarDesc.VarType.RAW) + + if self.user_defined_strategy.hierachical_allreduce: + for i in range(0, self.user_defined_strategy.nccl_comm_num): + startup_program.global_block().create_var( + name="Hierarchical_inter_NCCLID_{}".format(i), + persistable=True, + type=core.VarDesc.VarType.RAW) + startup_program.global_block().create_var( + name="Hierarchical_exter_NCCLID_{}".format(i), + persistable=True, + type=core.VarDesc.VarType.RAW) + + startup_program.global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": nccl_id_var}, + attrs={ + "trainers": trainer_endpoints, + "trainer_id": trainer_id, + "nccl_comm_num": self.user_defined_strategy.nccl_comm_num, + "use_hierarchical_allreduce": + self.user_defined_strategy.hierachical_allreduce, + "hierarchical_allreduce_inter_ranks": + self.user_defined_strategy.hierachical_allreduce_inter_ranks + }) + + def _try_to_compile(self, startup_program, main_program, loss): + build_strategy = get_build_strategy(self.user_defined_strategy) + exe_strategy = get_execution_strategy(self.user_defined_strategy) + node_num = self.role_maker.worker_num() + if 
self.role_maker._is_collective: + assert node_num >= 1, "nccl2 node_num must >= 1, now:{}" % node_num + + if node_num <= 1: + # local mode + if self.user_defined_strategy.nccl_comm_num > 1: + logging.warn("set nccl_comm_num=1 since you only have 1 node.") + self.user_defined_strategy.nccl_comm_num = 1 + + if self.user_defined_strategy.hierachical_allreduce: + logging.warn( + "set hierachical_allreduce=False since you only have 1 node." + ) + self.user_defined_strategy.hierachical_allreduce = False + + sync_allreduce = self.user_defined_strategy.sync_nccl_allreduce + if sync_allreduce: + exe_strategy.num_threads = self.user_defined_strategy.nccl_comm_num + 1 + if self.user_defined_strategy.hierachical_allreduce: + exe_strategy.num_threads = 2 * self.user_defined_strategy.nccl_comm_num + 1 + if exe_strategy.num_threads > 4: + logging.warn( + "if you use hierachical_allreduce or " + "with multi nccl comm, please export FLAGS_sync_nccl_allreduce = 0" + ) + + # TODO(guru4elephant): should be an independent optimizer + sync_batch_norm = self.user_defined_strategy.sync_batch_norm + if sync_batch_norm: + self.user_defined_strategy.nccl_comm_num = 1 + self.user_defined_strategy.hierachical_allreduce = False + exe_strategy.num_threads = 1 + logging.warn( + "use sync_batch_norm will hang when set num_threads > 1, so " + "set num_threads=1, nccl_comm_num=1, hierachical_allreduce=False." 
+ ) + + # TODO(guru4elephant): should be an independent optimizer + self._setup_nccl_op(startup_program, main_program) + + build_strategy.num_trainers = self.role_maker.worker_num() + build_strategy.trainer_id = self.role_maker.worker_index() + build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints( + ) + build_strategy.enable_backward_optimizer_op_deps = True + + self._compiled_program = compiler.CompiledProgram(main_program) + + self._compiled_program.with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exe_strategy, + share_vars_from=None) + + return self._compiled_program + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + if startup_program == None: + startup_program = paddle.default_startup_program() + compiled_program = self._try_to_compile(startup_program, + loss.block.program, loss) + loss.block.program.graph = compiled_program + + # just return self.optimizer_ops and self.param_grads + return None, None diff --git a/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py new file mode 100644 index 0000000000000..33b7b2bb1e852 --- /dev/null +++ b/python/paddle/fleet/meta_optimizers/meta_optimizer_base.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+__all__ = ["MetaOptimizerBase"]
+
+
+class MetaOptimizerBase(object):
+    def __init__(self, optimizer):
+        pass
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        self.loss = loss
+        self.role_maker = role_maker
+        self.user_defined_optimizer = user_defined_optimizer
+        self.user_defined_strategy = user_defined_strategy
+
+    def _update_inner_optimizer(self, optimizer):
+        self.inner_opt = optimizer
+
+    def _can_apply(self):
+        return False
+
+    def _is_graph_out(self):
+        return False
+
+    def _can_update(self, optimizer):
+        if str(optimizer.__class__.__name__) in self.meta_optimizers_white_list:
+            return True
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        raise NotImplementedError("meta optimizer not implemented")
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        optimize_ops, params_grads = self.minimize_impl(
+            loss, startup_program, parameter_list, no_grad_set)
+        return optimize_ops, params_grads
diff --git a/python/paddle/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/fleet/meta_optimizers/recompute_optimizer.py
new file mode 100644
index 0000000000000..902b8367b34f6
--- /dev/null
+++ b/python/paddle/fleet/meta_optimizers/recompute_optimizer.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and + +from paddle.fluid.optimizer import RecomputeOptimizer as RO +from .meta_optimizer_base import MetaOptimizerBase + +__all__ = ["RecomputeOptimizer"] + + +class RecomputeOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(RecomputeOptimizer, self).__init__(optimizer) + #self.inner_opt = RO(optimizer) + self.inner_opt = optimizer + self.wrapped_opt = RO(optimizer) + # we do not allow meta optimizer to be inner optimizer currently + self.meta_optimizers_white_list = [] + + def _set_basic_info(self, loss, role_maker, user_defined_optimizer, + user_defined_strategy): + super(RecomputeOptimizer, self)._set_basic_info( + loss, role_maker, user_defined_optimizer, user_defined_strategy) + self.wrapped_opt._set_checkpoints([]) + + def _can_apply(self): + if self.user_defined_strategy.recompute == True: + if len(self.user_defined_strategy.recompute_checkpoints) == 0: + return False + else: + return True + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + return self.wrapped_opt.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + optimize_ops, params_grads = \ + self.wrapped_opt.minimize(loss, startup_program, + parameter_list, no_grad_set) + return optimize_ops, params_grads diff --git a/python/paddle/fleet/metrics/metric.py b/python/paddle/fleet/metrics/metric.py index 847ddc47ac891..83e0dd2e541c3 100644 --- a/python/paddle/fleet/metrics/metric.py +++ b/python/paddle/fleet/metrics/metric.py @@ -11,3 +11,375 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+"""Fleet Metrics""" + +import paddle.fluid as fluid +import math +import numpy as np +from paddle.fluid.framework import Variable +from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet as fleet + + +def sum(input, scope=None): + """ + distributed sum in fleet + + Args: + input(numpy.array|Variable|string): output of a layer + scope(Scope): specific scope + + Returns: + global_metric(numpy.array): sum array + + Example: + .. code-block:: python + + # in model.py + input = fluid.layers.cast(some_input, dtype='float32') + cnt = fluid.layers.reduce_sum(input) + global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + tmp = fluid.layers.elementwise_add(cnt, global_cnt) + fluid.layers.assign(tmp, global_cnt) + + # in train.py, after train or infer + res = np.array(scope.find_var(global_cnt.name).get_tensor()) + print("sum array: ", paddle.fleet.sum(res)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(input, Variable): + input = np.array(scope.find_var(input.name).get_tensor()) + elif isinstance(input, str): + input = np.array(scope.find_var(input).get_tensor()) + old_shape = np.array(input.shape) + output = np.copy(input) * 0 + fleet._role_maker._all_reduce(input, output, mode="sum") + output = output.reshape(old_shape) + return output + + +def max(input, scope=None): + """ + distributed max in fleet + + Args: + input(numpy.array|Variable|string): output of a layer + scope(Scope): specific scope + + Returns: + global_metric(numpy.array): max array + + Example: + .. 
code-block:: python + + # in model.py + input = fluid.layers.cast(some_input, dtype='float32') + cnt = fluid.layers.reduce_sum(input) + global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + tmp = fluid.layers.elementwise_max(cnt, global_cnt) + fluid.layers.assign(tmp, global_cnt) + + # in train.py, after train or infer + res = np.array(scope.find_var(global_cnt.name).get_tensor()) + print("max array: ", paddle.fleet.max(res)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(input, Variable): + input = np.array(scope.find_var(input.name).get_tensor()) + elif isinstance(input, str): + input = np.array(scope.find_var(input).get_tensor()) + old_shape = np.array(input.shape) + output = np.copy(input) * 0 + fleet._role_maker._all_reduce(input, output, mode="max") + output = output.reshape(old_shape) + return output + + +def min(input, scope=None): + """ + distributed min in fleet + + Args: + input(numpy.array|Variable|string): output of a layer + scope(Scope): specific scope + + Returns: + global_metric(numpy.array): min array + + Example: + .. 
code-block:: python + + # in model.py + input = fluid.layers.cast(some_input, dtype='float32') + cnt = fluid.layers.reduce_sum(input) + global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + tmp = fluid.layers.elementwise_min(cnt, global_cnt) + fluid.layers.assign(tmp, global_cnt) + + # in train.py, after train or infer + res = np.array(scope.find_var(global_cnt.name).get_tensor()) + print("min array: ", paddle.fleet.min(res)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(input, Variable): + input = np.array(scope.find_var(input.name).get_tensor()) + elif isinstance(input, str): + input = np.array(scope.find_var(input).get_tensor()) + old_shape = np.array(input.shape) + output = np.copy(input) * 0 + fleet._role_maker._all_reduce(input, output, mode="min") + output = output.reshape(old_shape) + return output + + +def auc(stat_pos, stat_neg, scope=None): + """ + distributed auc in fleet + + Args: + stat_pos(numpy.array|Variable|string): stat_pos in output of fluid.layers.auc + stat_neg(numpy.array|Variable|string): stat_neg in output of fluid.layers.auc + scope(Scope): specific scope + + Returns: + auc_value(float): auc value + + Example: + .. 
code-block:: python + + # in model.py + similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0)) + binary_predict = fluid.layers.concat( + input=[fluid.layers.elementwise_sub(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1) + self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] = + fluid.layers.auc(input=binary_predict, label=label, curve='ROC', num_thresholds=4096) + + # in train.py, after train or infer + pos = np.array(scope.find_var(stat_pos.name).get_tensor()) + neg = np.array(scope.find_var(stat_neg.name).get_tensor()) + print("auc: ", paddle.fleet.auc(pos, neg)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(stat_pos, Variable): + stat_pos = np.array(scope.find_var(stat_pos.name).get_tensor()) + elif isinstance(stat_pos, str): + stat_pos = np.array(scope.find_var(stat_pos).get_tensor()) + if isinstance(stat_neg, Variable): + stat_neg = np.array(scope.find_var(stat_neg.name).get_tensor()) + elif isinstance(stat_neg, str): + stat_neg = np.array(scope.find_var(stat_neg).get_tensor()) + # auc pos bucket shape + old_pos_shape = np.array(stat_pos.shape) + # reshape to one dim + stat_pos = stat_pos.reshape(-1) + global_pos = np.copy(stat_pos) * 0 + # mpi allreduce + fleet._role_maker._all_reduce(stat_pos, global_pos) + # reshape to its original shape + global_pos = global_pos.reshape(old_pos_shape) + + # auc neg bucket + old_neg_shape = np.array(stat_neg.shape) + stat_neg = stat_neg.reshape(-1) + global_neg = np.copy(stat_neg) * 0 + fleet._role_maker._all_reduce(stat_neg, global_neg) + global_neg = global_neg.reshape(old_neg_shape) + + # calculate auc + num_bucket = len(global_pos[0]) + area = 0.0 + pos = 0.0 + neg = 0.0 + new_pos = 0.0 + new_neg = 0.0 + total_ins_num = 0 + for i in range(num_bucket): + index = num_bucket - 1 - i + new_pos = pos + global_pos[0][index] + total_ins_num += global_pos[0][index] + new_neg = 
neg + global_neg[0][index] + total_ins_num += global_neg[0][index] + area += (new_neg - neg) * (pos + new_pos) / 2 + pos = new_pos + neg = new_neg + + auc_value = None + if pos * neg == 0 or total_ins_num == 0: + auc_value = 0.5 + else: + auc_value = area / (pos * neg) + + fleet._role_maker._barrier_worker() + return auc_value + + +def mae(abserr, total_ins_num, scope=None): + """ + distributed mae in fleet + + Args: + abserr(numpy.array|Variable|string): abserr in output of fluid.contrib.layers.ctr_metric_bundle + total_ins_num(int|float): total train/infer instance count + scope(Scope): specific scope + + Returns: + mae(float): mae value + + Example: + .. code-block:: python + + # in model.py + sqrerr, abserr, prob, q, pos, total = fluid.contrib.layers.ctr_metric_bundle(similarity_norm, fluid.layers.cast(x=label, dtype='float32')) + + # in train.py, after train or infer + res = np.array(scope.find_var(abserr.name).get_tensor()) + print("mae: ", paddle.fleet.mae(res, total_ins_num)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(abserr, Variable): + abserr = np.array(scope.find_var(abserr.name).get_tensor()) + elif isinstance(abserr, str): + abserr = np.array(scope.find_var(abserr).get_tensor()) + old_metric_shape = np.array(abserr.shape) + abserr = abserr.reshape(-1) + global_metric = np.copy(abserr) * 0 + fleet._role_maker._all_reduce(abserr, global_metric) + global_metric = global_metric.reshape(old_metric_shape) + mae_value = global_metric[0] / total_ins_num + return mae_value + + +def rmse(sqrerr, total_ins_num, scope=None): + """ + distributed rmse in fleet + + Args: + sqrerr(numpy.array|Variable|string): sqrerr in output of fluid.contrib.layers.ctr_metric_bundle + total_ins_num(int|float): total train/infer instance count + scope(Scope): specific scope + + Returns: + rmse(float): rmse value + + Example: + .. 
code-block:: python + + # in model.py + sqrerr, abserr, prob, q, pos, total = fluid.contrib.layers.ctr_metric_bundle(similarity_norm, fluid.layers.cast(x=label, dtype='float32')) + + # in train.py, after train or infer + res = np.array(scope.find_var(sqrerr.name).get_tensor()) + print("rmse: ", paddle.fleet.rmse(res, total_ins_num)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(sqrerr, Variable): + sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor()) + elif isinstance(sqrerr, str): + sqrerr = np.array(scope.find_var(sqrerr).get_tensor()) + old_metric_shape = np.array(sqrerr.shape) + sqrerr = sqrerr.reshape(-1) + global_metric = np.copy(sqrerr) * 0 + fleet._role_maker._all_reduce(sqrerr, global_metric) + global_metric = global_metric.reshape(old_metric_shape) + rmse_value = math.sqrt(global_metric[0] / total_ins_num) + return rmse_value + + +def mse(sqrerr, total_ins_num, scope=None): + """ + distributed mse in fleet + + Args: + sqrerr(numpy.array|Variable|string): sqrerr in output of fluid.contrib.layers.ctr_metric_bundle + total_ins_num(int|float): total train/infer instance count + scope(Scope): specific scope + + Returns: + mse(float): mse value + + Example: + .. 
code-block:: python + + # in model.py + sqrerr, abserr, prob, q, pos, total = fluid.contrib.layers.ctr_metric_bundle(similarity_norm, fluid.layers.cast(x=label, dtype='float32')) + + # in train.py, after train or infer + metric = np.array(scope.find_var(sqrerr.name).get_tensor()) + print("mse: ", paddle.fleet.mse(metric, total_ins_num)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(sqrerr, Variable): + sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor()) + elif isinstance(sqrerr, str): + sqrerr = np.array(scope.find_var(sqrerr).get_tensor()) + old_metric_shape = np.array(sqrerr.shape) + sqrerr = sqrerr.reshape(-1) + global_metric = np.copy(sqrerr) * 0 + fleet._role_maker._all_reduce(sqrerr, global_metric) + global_metric = global_metric.reshape(old_metric_shape) + mse_value = global_metric[0] / total_ins_num + return mse_value + + +def acc(correct, total, scope=None): + """ + distributed accuracy in fleet + + Args: + correct(numpy.array|Variable|string): correct Variable + total(numpy.array|Variable): total Variable + scope(Scope): specific scope + + Returns: + acc(float): accuracy value + + Example: + .. 
code-block:: python + + # in model.py + correct = fluid.layers.create_global_var(dtype='float32', shape=[1], value=0) + total = fluid.layers.create_global_var(dtype='float32', shape=[1], value=0) + acc = fluid.layers.acc(predict, label, k=1, correct=correct, total=total) + + global_correct = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + tmp1 = fluid.layers.elementwise_min(correct, global_correct) + fluid.layers.assign(tmp1, global_correct) + + global_total = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + tmp2 = fluid.layers.elementwise_min(total, global_total) + fluid.layers.assign(tmp2, global_total) + + # in train.py, after train or infer + correct_num = np.array(scope.find_var(correct.name).get_tensor()) + total_num = np.array(scope.find_var(total.name).get_tensor()) + print("accuracy: ", paddle.fleet.acc(correct_num, total_num)) + """ + fleet._role_maker._barrier_worker() + if scope is None: + scope = fluid.global_scope() + if isinstance(correct, Variable): + correct = np.array(scope.find_var(correct.name).get_tensor()) + elif isinstance(correct, str): + correct = np.array(scope.find_var(correct).get_tensor()) + if isinstance(total, Variable): + total = np.array(scope.find_var(total.name).get_tensor()) + elif isinstance(total, str): + total = np.array(scope.find_var(total).get_tensor()) + global_correct_num = np.copy(correct) * 0 + global_total_num = np.copy(total) * 0 + fleet._role_maker._all_reduce(correct, global_correct_num) + fleet._role_maker._all_reduce(total, global_total_num) + return float(global_correct_num[0]) / float(global_total_num[0]) diff --git a/python/paddle/fleet/parameter_server/__init__.py b/python/paddle/fleet/runtime/__init__.py similarity index 78% rename from python/paddle/fleet/parameter_server/__init__.py rename to python/paddle/fleet/runtime/__init__.py index 847ddc47ac891..f38287cf51a72 100644 --- a/python/paddle/fleet/parameter_server/__init__.py 
+++ b/python/paddle/fleet/runtime/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,3 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .collective_runtime import CollectiveRuntime + +__all__ = ["CollectiveRuntime"] diff --git a/python/paddle/fleet/runtime/collective_runtime.py b/python/paddle/fleet/runtime/collective_runtime.py new file mode 100644 index 0000000000000..0881c4b52c822 --- /dev/null +++ b/python/paddle/fleet/runtime/collective_runtime.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .runtime_base import RuntimeBase +import logging + + +class CollectiveRuntime(RuntimeBase): + def __init__(self): + super(CollectiveRuntime, self).__init__() + + def _init_worker(self): + logging.warn( + "You should not call 'init_worker' method for collective mode.") + pass + + def _run_worker(self): + logging.warn( + "You should not call 'run_worker' method for collective mode.") + pass + + def _init_server(self): + logging.warn( + "You should not call 'init_server' method for collective mode.") + pass + + def _run_server(self): + logging.warn( + "You should not call 'run_server' method for collective mode.") + pass + + def _stop_worker(self): + logging.warn( + "You should not call 'stop_worker' method for collective mode.") + pass + + # save inference model should be added here diff --git a/python/paddle/fleet/runtime/runtime_base.py b/python/paddle/fleet/runtime/runtime_base.py new file mode 100644 index 0000000000000..5610a5305a464 --- /dev/null +++ b/python/paddle/fleet/runtime/runtime_base.py @@ -0,0 +1,38 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = [] + + +class RuntimeBase(object): + def __init__(self): + pass + + def _set_basic_info(self, loss, role_maker, optimizer, strategy): + self.loss = loss + self.role_maker = role_maker + self.optimizer = optimizer + self.strategy = strategy + + def _run_worker(self): + pass + + def _init_server(self): + pass + + def _run_server(self): + pass + + def _stop_worker(self): + pass diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 273a669a1414e..50e6eaa80c135 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -35,7 +35,7 @@ 'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool', 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc', - '_pull_box_extended_sparse' + '_pull_box_extended_sparse', 'bilateral_slice' ] @@ -1409,3 +1409,65 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): if len(outs) == 1: return outs[0], outs_extend[0] return outs, outs_extend + + +def bilateral_slice(x, guide, grid, has_offset, name=None): + """ + :alias_main: paddle.nn.functional.bilateral_slice + :alias: paddle.nn.functional.bilateral_slice,paddle.nn.functional.vision.bilateral_slice + :old_api: paddle.fluid.layers.bilateral_slice + + This operation implements bilateral slicing on the input according to the guide map. + For more information of bilateral slicing, please refer to Deep Bilateral Learning for Real-Time Image Enhancement _ + + Args: + x(Variable): The input tensor, which is a 4-D tensor with shape + [N, C, H, W], N is the batch size, C is the channel + number, H and W is the feature height and width. + The data type is float32 and float64. + guide(Variable): Input grid tensor of shape [N, H, W]. The + data type is float32 and float64. + grid(Variable): Input grid tensor of shape [N, C, D, H, W]. 
The + data type is float32 and float64. + has_offset(bool): Whether to slice with affine offset. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Variable: Output of shape [N, C, H, W]. The data type is same as input tensor. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + + x = fluid.data(name='x', shape=[None, 3, 101, 60], dtype='float32') + guide = fluid.data(name='guide', shape=[None, 101, 60], dtype='float32') + grid = fluid.data(name='grid', shape=[None, 12, 8, 10, 6], dtype='float32') + + # without offset + output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=False) + + # has offset + output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=True) + + """ + helper = LayerHelper("bilateral_slice", **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'bilateral_slice') + check_variable_and_dtype(guide, 'guide', ['float32', 'float64'], + 'bilateral_slice') + check_variable_and_dtype(grid, 'grid', ['float32', 'float64'], + 'bilateral_slice') + + out = helper.create_variable_for_type_inference(x.dtype) + inputs = {'X': x, 'Guide': guide, 'Grid': grid} + + helper.append_op( + type='bilateral_slice', + inputs=inputs, + attrs={'has_offset': has_offset}, + outputs={'Out': out}) + return out diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index d7d0c68a3145f..d87363abf14cd 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -169,13 +169,13 @@ def run_shell_command(cmd): if err: return None else: - return out.decode('utf-8') + return out.decode('utf-8').strip() def get_dso_path(core_so, dso_name): if core_so and dso_name: return run_shell_command("ldd %s|grep %s|awk '{print $3}'" % - (core_so, dso_name)).strip() + (core_so, dso_name)) else: return None @@ -225,7 +225,11 @@ def to_list(s): # The final solution is to upgrade glibc to > 2.22 on the 
target system. if platform.system().lower() == 'linux' and less_than_ver(get_glibc_ver(), '2.23'): - pre_load('libgomp') + try: + pre_load('libgomp') + except Exception as e: + # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so + sys.stderr.write('Error: Can not preload libgomp.so') load_noavx = False diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index e020507af418b..d359910167d63 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -94,7 +94,9 @@ def save_dygraph(state_dict, model_path): pickle.dump(model_dict, f, protocol=2) -@dygraph_only +# TODO(qingqing01): remove dygraph_only to support loading static model. +# maybe need to unify the loading interface after 2.0 API is ready. +#@dygraph_only def load_dygraph(model_path, keep_name_table=False): ''' :api_attr: imperative diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index 61ff82f5be860..73dba66d3fca4 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -17,12 +17,12 @@ import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor +from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code class AssertTransformer(gast.NodeTransformer): """ - A class transforms python assert to fluid.layers.Assert. + A class transforms python assert to convert_assert. 
""" def __init__(self, wrapper_root): @@ -32,21 +32,15 @@ def __init__(self, wrapper_root): self.wrapper_root = wrapper_root self.root = wrapper_root.node - self.static_analysis_visitor = StaticAnalysisVisitor(self.root) - def transform(self): self.visit(self.root) def visit_Assert(self, node): - if not self.static_analysis_visitor.is_tensor_node(node.test): - return node - cast_node = gast.Call( - func=gast.parse("fluid.layers.cast").body[0].value, - args=[node.test, gast.Constant( - value="bool", kind=None)], - keywords=[]) - assert_node = gast.Call( - func=gast.parse("fluid.layers.Assert").body[0].value, - args=[cast_node], - keywords=[]) - return gast.Expr(value=assert_node) + convert_assert_node = gast.parse( + 'fluid.dygraph.dygraph_to_static.convert_operators.convert_assert({test}, {msg})'. + format( + test=ast_to_source_code(node.test), + msg=ast_to_source_code(node.msg) + if node.msg else "")).body[0].value + + return gast.Expr(value=convert_assert_node) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 78031a5b38833..02d8754e62c6d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -14,7 +14,8 @@ from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable -from paddle.fluid.framework import Variable, core +from paddle.fluid.framework import core, Variable +from paddle.fluid.layers import Assert, Print from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn @@ -259,3 +260,28 @@ def convert_var_dtype(var, dtype): return cast(var, dtype=cast_map[dtype]) else: return eval('{}(var)'.format(dtype)) + + +def convert_assert(cond, message=""): + """ + A function representation of a Python ``assert`` statement. 
+ """ + if isinstance(cond, Variable): + cond = cast(cond, "bool") + # NOTE: message is not used because Paddle Assert has no corresponding parameter to use. + return Assert(cond) + else: + assert cond, message + + +def convert_print(*args): + """ + A function representing Python ``print`` statement. Note: this is a basic + python function so we haven't handle sep, end, file and flush parameters of + python function. + """ + for var in args: + if isinstance(var, Variable): + var = Print(var) + else: + print(var) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index af10c65400ee2..0a9e66a5bb0b1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -112,14 +112,7 @@ def __init__(self, main_program, inputs, outputs, parameters=None): self._outputs = NestSequence(outputs, need_check=True) self._params = parameters if parameters is not None else [] - # Check all params from main program can be found in self._params: - # 1. parameter in self._params should be type `framework.ParamBase` which are created in dygraph. - # 2. parameter from transformed program shall be found in self._params. - # Because they share same data with ParamBase of original dygraph. - self._check_params_all_inited(main_program) - self._prune_unused_params(main_program) - - self._infer_program = main_program + self._infer_program = self._verify_program(main_program) self._train_program = self._append_backward_desc() # Switch infer or train by train() and eval() self._trace_program = None @@ -128,6 +121,20 @@ def __init__(self, main_program, inputs, outputs, parameters=None): # Set default mode to train self.train() + def _verify_program(self, main_program): + """ + Verify that the program parameter is initialized, prune some unused params, + and remove redundant op callstack. + """ + # 1. 
Check all params from main program can be found in self._params + self._check_params_all_inited(main_program) + # 2. Prune the parameters not used anywhere in the program. + self._prune_unused_params(main_program) + # 3. Remove op's python call stack with redundant low-level error messages. + main_program = self._remove_op_call_stack(main_program) + + return main_program + @switch_to_static_graph def _append_backward_desc(self): program = self._infer_program.clone() @@ -295,6 +302,19 @@ def _set_grad_type(self, params): continue param._set_grad_type(grad_var.type()) + def _remove_op_call_stack(self, main_program): + """ + Remove op's python call stack with redundant low-level error messages related to + transforamtions to avoid confusing users. + """ + assert isinstance(main_program, framework.Program) + for block in main_program.blocks: + for op in block.ops: + if op.has_attr("op_callstack"): + op._remove_attr("op_callstack") + + return main_program + def _check_params_all_inited(self, main_program): """ Check all params from main program are already initialized, see details as follows: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index e55018d2e7df9..1b6b64ae1fdee 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -47,84 +47,17 @@ def transform(self): # NOTE: deal with print in PY3 def visit_Call(self, node): if isinstance(node.func, gast.Name) and node.func.id == 'print': - parent_node = self.node_to_wrapper_map[node].parent.node - if isinstance(parent_node, gast.Expr): - # NOTE: why need transform to gast.Assign node - # only fluid.layers.Print(x) will be pruned when exe.run(use_prune=True) - print_assign_node = self._create_assign_node(node) - if print_assign_node is not None: - return print_assign_node - else: - return self._transform_call_node(node) + 
convert_print_node = self._create_print_node(node.args) + return gast.Expr(value=convert_print_node) return node # NOTE: deal with print in PY2 def visit_Print(self, node): - print_assign_node = self._create_assign_node(node) - if print_assign_node is not None: - return print_assign_node - return node - - def _transform_call_node(self, node): - assert isinstance(node, gast.Call), "visit Node is not gast.Call node." - var_node = self._get_print_var_node(node) - if var_node is None: - return node - if self._need_transform(var_node, node): - return self._build_print_call_node(var_node) - return node - - def _create_assign_node(self, node): - var_node = self._get_print_var_node(node) - if var_node is None: - return None - if self._need_transform(var_node, node): - return gast.Assign( - targets=[var_node], value=self._build_print_call_node(var_node)) - return None - - def _build_print_call_node(self, node): - return gast.Call( - func=gast.parse('fluid.layers.Print').body[0].value, - args=[node], - keywords=[ - gast.keyword( - arg='summarize', - value=gast.UnaryOp( - op=gast.USub(), - operand=gast.Constant( - value=1, kind=None))), gast.keyword( - arg='print_phase', - value=gast.Constant( - value='forward', kind=None)) - ]) - - def _get_print_var_node(self, node): - if isinstance(node, gast.Call): - var_list = node.args - elif isinstance(node, gast.Print): - var_list = node.values - if isinstance(var_list[0], gast.Tuple): - var_list = var_list[0].elts - # TODO: support print multiple Var - if len(var_list) == 1: - return var_list[0] - else: - _logger.warning( - "ProgramTranslator could not transform printing multiple values like < %s > now and will run it as-is." 
- % ast_to_source_code(node).strip()) - return None - - def _need_transform(self, var_node, print_node): - if isinstance(var_node, gast.Name): - if self.static_analysis_visitor.is_tensor_node(var_node): - return True - else: - _logger.warning( - "ProgramTranslator could not transform printing value that are not Tensor like < %s > now and will run it as-is." - % ast_to_source_code(print_node).strip()) - else: - _logger.warning( - "ProgramTranslator could not transform < %s > now and will run it as-is." - % ast_to_source_code(print_node).strip()) - return False + convert_print_node = self._create_print_node(node.values) + return gast.Expr(value=convert_print_node) + + def _create_print_node(self, print_args): + convert_print_func = gast.parse( + 'fluid.dygraph.dygraph_to_static.convert_operators.convert_print' + ).body[0].value + return gast.Call(func=convert_print_func, args=print_args, keywords=[]) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index bd468b55d812e..754a0b67fed92 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -653,8 +653,9 @@ def forward(self, x): """ def get_inout_spec(all_vars, target_vars, return_name=False): - valid_vars = [var for var in all_vars if isinstance(var, Variable)] + result_list = [] valid_var_dict = {} + valid_vars = [var for var in all_vars if isinstance(var, Variable)] for var in valid_vars: valid_var_dict[var.name] = var if target_vars: @@ -663,13 +664,13 @@ def get_inout_spec(all_vars, target_vars, return_name=False): if var.name not in valid_var_dict: raise RuntimeError( "The variable to feed/fetch are not exist.") - target_vars[i] = valid_var_dict[var.name] + result_list.append(valid_var_dict[var.name]) else: - target_vars = valid_vars + result_list = valid_vars if return_name: - target_vars = [var.name for var in target_vars] + result_list = [var.name for var in target_vars] - return target_vars + return result_list # 1. 
input check prog_translator = ProgramTranslator() diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index bba4eb071a4db..5673867717260 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -16,9 +16,12 @@ import contextlib import sys import numpy as np -import collections import six import re +import copy +import weakref +import warnings + from . import parallel_helper from .. import unique_name from paddle.fluid import core @@ -26,9 +29,6 @@ from .base import program_desc_tracing_guard, param_guard from paddle.fluid import framework from ..param_attr import ParamAttr -import copy -import weakref -import warnings __all__ = ['Layer'] diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 2fcd0fe1e5a6d..cce383be7e22c 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -67,6 +67,8 @@ def create_lr_var(self, lr): persistable=False) return lr + # Note: If you want to change what optimizer.state_dict stores, just overwrite this functions, + # "self.step_num" will be stored by default. def state_dict(self): """ Returns the state of the scheduler as a :class:`dict`. @@ -859,6 +861,7 @@ def __init__(self, self.num_bad_epochs = 0 self.epoch_num = 0 + # "cooldown_counter / best_loss / num_bad_epochs / epoch_num / learning_rate" will be stored. def _state_keys(self): self.keys = [ 'cooldown_counter', 'best_loss', 'num_bad_epochs', 'epoch_num', @@ -961,6 +964,8 @@ def __init__(self, learning_rate, dtype=None): self.epoch() + # For those subclass who overload _LearningRateEpochDecay, "self.epoch_num/learning_rate" will be stored by default. + # you can change it for your subclass. 
def _state_keys(self): self.keys = ['epoch_num', 'learning_rate'] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2e41a8ff417b3..d509fcc38e771 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -124,7 +124,7 @@ def set_value(self, value): framework._current_expected_place()) @framework.dygraph_only - def backward(self, backward_strategy=None): + def backward(self, backward_strategy=None, retain_graph=False): """ **Notes**: **This API is ONLY available in Dygraph mode** @@ -133,6 +133,10 @@ def backward(self, backward_strategy=None): Args: backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward + retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would + like to add more ops to the built graph after calling this method(`backward`), set the parameter + `retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. + Defaults to False. 
Returns: NoneType: None @@ -164,7 +168,8 @@ def backward(self, backward_strategy=None): backward_strategy = BackwardStrategy() backward_strategy.sort_sum_gradient = False - self._run_backward(backward_strategy, framework._dygraph_tracer()) + self._run_backward(backward_strategy, + framework._dygraph_tracer(), retain_graph) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f6cca91374e58..23ab436c0634b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1079,9 +1079,6 @@ def run(self, use_prune=use_prune, return_merged=return_merged) except Exception as e: - if not isinstance(e, core.EOFException): - warnings.warn( - "The following exception is not an EOF exception.") six.reraise(*sys.exc_info()) def _run_impl(self, program, feed, fetch_list, feed_var_name, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index cf4f47d13fc9f..393ee0682d4b9 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -66,7 +66,6 @@ _dygraph_tracer_ = None _dygraph_current_expected_place_ = None _current_device = None - global_prog_seed = 0 diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index fadd247e0df37..260033f9ef010 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1064,6 +1064,13 @@ def prepend_feed_ops(inference_program, persistable=True) for i, name in enumerate(feed_target_names): + if not global_block.has_var(name): + raise ValueError( + "The feeded_var_names[{i}]: '{name}' doesn't exist in pruned inference program. " + "Please check whether '{name}' is a valid feed_var name, or remove it from feeded_var_names " + "if '{name}' is not involved in the target_vars calculation.". 
+ format( + i=i, name=name)) out = global_block.var(name) global_block._prepend_op( type='feed', diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index a84276cc6a173..5c14d26f3fe24 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -286,27 +286,14 @@ def func(x, name=None): .. code-block:: python import paddle - import paddle.fluid as fluid import numpy as np - inputs = fluid.data(name="x", shape = [None, 4], dtype='float32') - output = paddle.%s(inputs) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - #input.shape=1X4, batch_size=1 - img = np.array([[1.0, 2.0, 3.0, 4.0]]).astype(np.float32) - res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) - print(res) - - # using dygraph - with paddle.imperative.guard(): - dygraph_input = paddle.imperative.to_variable(img) - dygraph_output = paddle.%s(dygraph_input) - print(dygraph_output.numpy()) -""" % (op_type, op_type) - + paddle.enable_imperative() + x_data = np.array([1, 2, 3, 4]).astype(np.float32) + x = paddle.imperative.to_variable(x_data) + res = paddle.%s(x) + print(res.numpy()) +""" % op_type return func diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5a5ec20b2ce3b..a894dbd005707 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6203,11 +6203,15 @@ def squeeze(input, axes, name=None): y = layers.squeeze(input=x, axes=[2]) # y.shape=[None, 5, 10] """ + if in_dygraph_mode(): + out, _ = core.ops.squeeze2(input, 'axes', axes) + return out + helper = LayerHelper("squeeze", **locals()) check_variable_and_dtype( input, 'input', ['float16', 'float32', 'float64', 'int8', 'int32', 'int64'], 'squeeze') - check_type(axes, 'axes', list, 'squeeze') + check_type(axes, 'axes', (list, tuple), 'squeeze') out = 
helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( @@ -11101,7 +11105,7 @@ def shape(input): input.shape = [3, 2] Args: - input (Variable): The input can be N-D Tensor or SelectedRows with data type float32, float64, int32, int64. + input (Variable): The input can be N-D Tensor or SelectedRows with data type float16, float32, float64, int32, int64. If input variable is type of SelectedRows, returns the shape of it's inner tensor. Returns: @@ -11124,8 +11128,9 @@ def shape(input): res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) print(res) # [array([ 3, 100, 100], dtype=int32)] """ - check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64'], 'shape') + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], + 'shape') helper = LayerHelper('shape', **locals()) out = helper.create_variable_for_type_inference(dtype='int32') helper.append_op( @@ -12006,23 +12011,21 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): def logical_and(x, y, out=None, name=None): """ :alias_main: paddle.logical_and - :alias: paddle.logical_and,paddle.tensor.logical_and,paddle.tensor.logic.logical_and - :old_api: paddle.fluid.layers.logical_and + :alias: paddle.logical_and, paddle.tensor.logical_and, paddle.tensor.logic.logical_and + :old_api: paddle.fluid.layers.logical_and - logical_and Operator - - It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor. - Each element of Out is calculated by + ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``. + Each element of ``out`` is calculated by .. 
math:: - Out = X \land Y + out = x \&\& y Args: - x(${x_type}): ${x_comment} - y(${y_type}): ${y_comment} - out(LoDTensor or Tensor): The LoDTensor or Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output. - name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + x(${x_type}): ${x_comment}. + y(${y_type}): ${y_comment}. + out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output. + name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: ${out_type}: ${out_comment} @@ -12030,25 +12033,16 @@ def logical_and(x, y, out=None, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - # Graph organizing - x = fluid.layers.data(name='x', shape=[2], dtype='bool') - y = fluid.layers.data(name='y', shape=[2], dtype='bool') - res = fluid.layers.logical_and(x=x, y=y) - # The comment lists another available method. 
- # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0) - # fluid.layers.logical_and(x=x, y=y, out=res) - - # Create an executor using CPU as an example - exe = fluid.Executor(fluid.CPUPlace()) - - # Execute - x_i = np.array([[1, 0], [0, 1]]).astype(np.bool) - y_i = np.array([[1, 1], [0, 0]]).astype(np.bool) - res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i, 'y':y_i}, fetch_list=[res]) - print(res_val) # [[True, False], [False, False]] + paddle.enable_imperative() + x_data = np.array([True, True, False, False], dtype=np.bool) + y_data = np.array([True, False, True, False], dtype=np.bool) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + res = paddle.logical_and(x, y) + print(res.numpy()) # [True False False False] """ return _logical_op( @@ -12059,23 +12053,21 @@ def logical_and(x, y, out=None, name=None): def logical_or(x, y, out=None, name=None): """ :alias_main: paddle.logical_or - :alias: paddle.logical_or,paddle.tensor.logical_or,paddle.tensor.logic.logical_or - :old_api: paddle.fluid.layers.logical_or - - logical_or Operator + :alias: paddle.logical_or, paddle.tensor.logical_or, paddle.tensor.logic.logical_or + :old_api: paddle.fluid.layers.logical_or - It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor. - Each element of Out is calculated by + ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``. + Each element of ``out`` is calculated by .. math:: - Out = X \lor Y + out = x || y Args: - x(${x_type}): ${x_comment} - y(${y_type}): ${y_comment} - out(LoDTensor or Tensor): The LoDTensor or Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output. - name(str|None): The default value is None. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + x(${x_type}): ${x_comment}. + y(${y_type}): ${y_comment}. + out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output. + name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: ${out_type}: ${out_comment} @@ -12083,25 +12075,16 @@ def logical_or(x, y, out=None, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - # Graph organizing - x = fluid.layers.data(name='x', shape=[2], dtype='bool') - y = fluid.layers.data(name='y', shape=[2], dtype='bool') - res = fluid.layers.logical_or(x=x, y=y) - # The comment lists another available method. 
- # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0) - # fluid.layers.logical_or(x=x, y=y, out=res) - - # Create an executor using CPU as an example - exe = fluid.Executor(fluid.CPUPlace()) - - # Execute - x_i = np.array([[1, 0], [0, 1]]).astype(np.bool) - y_i = np.array([[1, 1], [0, 0]]).astype(np.bool) - res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i, 'y':y_i}, fetch_list=[res]) - print(res_val) # [[True, True], [False, True]] + paddle.enable_imperative() + x_data = np.array([True, True, False, False], dtype=np.bool) + y_data = np.array([True, False, True, False], dtype=np.bool) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + res = paddle.logical_or(x, y) + print(res.numpy()) # [True True True False] """ return _logical_op( @@ -12112,23 +12095,21 @@ def logical_or(x, y, out=None, name=None): def logical_xor(x, y, out=None, name=None): """ :alias_main: paddle.logical_xor - :alias: paddle.logical_xor,paddle.tensor.logical_xor,paddle.tensor.logic.logical_xor - :old_api: paddle.fluid.layers.logical_xor - - logical_xor Operator + :alias: paddle.logical_xor, paddle.tensor.logical_xor, paddle.tensor.logic.logical_xor + :old_api: paddle.fluid.layers.logical_xor - It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor. - Each element of Out is calculated by + ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``. + Each element of ``out`` is calculated by .. math:: - Out = (X \lor Y) \land \lnot (X \land Y) + out = (x || y) \&\& !(x \&\& y) Args: - x(${x_type}): ${x_comment} - y(${y_type}): ${y_comment} - out(LoDTensor or Tensor): The LoDTensor or Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output. 
- name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + x(${x_type}): ${x_comment}. + y(${y_type}): ${y_comment}. + out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output. + name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: ${out_type}: ${out_comment} @@ -12136,25 +12117,16 @@ def logical_xor(x, y, out=None, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - # Graph organizing - x = fluid.layers.data(name='x', shape=[2], dtype='bool') - y = fluid.layers.data(name='y', shape=[2], dtype='bool') - res = fluid.layers.logical_xor(x=x, y=y) - # The comment lists another available method. 
- # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0) - # fluid.layers.logical_xor(x=x, y=y, out=res) - - # Create an executor using CPU as an example - exe = fluid.Executor(fluid.CPUPlace()) - - # Execute - x_i = np.array([[1, 0], [0, 1]]).astype(np.bool) - y_i = np.array([[1, 1], [0, 0]]).astype(np.bool) - res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i, 'y':y_i}, fetch_list=[res]) - print(res_val) # [[False, True], [False, True]] + paddle.enable_imperative() + x_data = np.array([True, True, False, False], dtype=np.bool) + y_data = np.array([True, False, True, False], dtype=np.bool) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + res = paddle.logical_xor(x, y) + print(res.numpy()) # [False True True False] """ return _logical_op( @@ -12165,46 +12137,34 @@ def logical_xor(x, y, out=None, name=None): def logical_not(x, out=None, name=None): """ :alias_main: paddle.logical_not - :alias: paddle.logical_not,paddle.tensor.logical_not,paddle.tensor.logic.logical_not - :old_api: paddle.fluid.layers.logical_not + :alias: paddle.logical_not, paddle.tensor.logical_not, paddle.tensor.logic.logical_not + :old_api: paddle.fluid.layers.logical_not - logical_not Operator - - It operates element-wise on X, and returns the Out. X and Out are N-dim boolean LoDTensor or Tensor. - Each element of Out is calculated by + ``logical_not`` operator computes element-wise logical NOT on ``x``, and returns ``out``. ``x`` and ``out`` are N-dim boolean ``Variable``. + Each element of ``out`` is calculated by .. math:: - Out = \lnot X + out = !x Args: - x(${x_type}): ${x_comment} - out(LoDTensor/Tensor): The LoDTensor/Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output. - name(str|None): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name` + x(${x_type}): ${x_comment}. + out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable` will be created to save the output. + name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: ${out_type}: ${out_comment} Examples: .. code-block:: python - - import paddle.fluid as fluid + import paddle import numpy as np - # Graph organizing - x = fluid.layers.data(name='x', shape=[2], dtype='bool') - res = fluid.layers.logical_not(x) - # The comment lists another avaliable method. - # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0) - # fluid.layers.logical_not(x, out=res) - - # Create an executor using CPU as an example - exe = fluid.Executor(fluid.CPUPlace()) - - # Execute - x_i = np.array([[1, 0]]).astype(np.bool) - res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res]) - print(res_val) # [[False, True]] + paddle.enable_imperative() + x_data = np.array([True, False, True, False], dtype=np.bool) + x = paddle.imperative.to_variable(x_data) + res = paddle.logical_not(x) + print(res.numpy()) # [False True False True] """ return _logical_op( diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index c4b6da5629ae4..3adb243c8f83d 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -35,6 +35,8 @@ 'acos', 'asin', 'sin', + 'sinh', + 'cosh', 'round', 'reciprocal', 'square', @@ -80,9 +82,9 @@ def softshrink(x, alpha=None): softshrink.__doc__ = """ - :alias_main: paddle.nn.functional.softshrink - :alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink - :old_api: paddle.fluid.layers.softshrink + :alias_main: paddle.nn.functional.softshrink + 
:alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink + :old_api: paddle.fluid.layers.softshrink :strong:`Softshrink Activation Operator` @@ -127,9 +129,9 @@ def hard_shrink(x, threshold=None): hard_shrink.__doc__ = _hard_shrink_.__doc__ + """ - :alias_main: paddle.nn.functional.hard_shrink - :alias: paddle.nn.functional.hard_shrink,paddle.nn.functional.activation.hard_shrink - :old_api: paddle.fluid.layers.hard_shrink + :alias_main: paddle.nn.functional.hard_shrink + :alias: paddle.nn.functional.hard_shrink,paddle.nn.functional.activation.hard_shrink + :old_api: paddle.fluid.layers.hard_shrink Examples: @@ -154,9 +156,9 @@ def cumsum(x, axis=None, exclusive=None, reverse=None): cumsum.__doc__ = """ - :alias_main: paddle.cumsum - :alias: paddle.cumsum,paddle.tensor.cumsum,paddle.tensor.math.cumsum - :old_api: paddle.fluid.layers.cumsum + :alias_main: paddle.cumsum + :alias: paddle.cumsum,paddle.tensor.cumsum,paddle.tensor.math.cumsum + :old_api: paddle.fluid.layers.cumsum The cumulative sum of the elements along a given axis. By default, the first element of the result is the same of the first element of the input. If exlusive is true, the first element of the result is 0. 
@@ -196,9 +198,9 @@ def thresholded_relu(x, threshold=None): thresholded_relu.__doc__ = """ - :alias_main: paddle.nn.functional.thresholded_relu - :alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu - :old_api: paddle.fluid.layers.thresholded_relu + :alias_main: paddle.nn.functional.thresholded_relu + :alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu + :old_api: paddle.fluid.layers.thresholded_relu :strong:`Thresholded ReLU Activation Operator` @@ -282,9 +284,9 @@ def gelu(x, approximate=False): gelu.__doc__ = """ - :alias_main: paddle.nn.functional.gelu - :alias: paddle.nn.functional.gelu,paddle.nn.functional.activation.gelu - :old_api: paddle.fluid.layers.gelu + :alias_main: paddle.nn.functional.gelu + :alias: paddle.nn.functional.gelu,paddle.nn.functional.activation.gelu + :old_api: paddle.fluid.layers.gelu :strong:`GeLU Activation Operator` For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). @@ -370,9 +372,9 @@ def erf(x): erf.__doc__ = """ - :alias_main: paddle.erf - :alias: paddle.erf,paddle.tensor.erf,paddle.tensor.math.erf,paddle.nn.functional.erf,paddle.nn.functional.activation.erf - :old_api: paddle.fluid.layers.erf + :alias_main: paddle.erf + :alias: paddle.erf,paddle.tensor.erf,paddle.tensor.math.erf,paddle.nn.functional.erf,paddle.nn.functional.activation.erf + :old_api: paddle.fluid.layers.erf :strong:`Erf Operator` For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). 
diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 969e85a742519..2d9ece63d0c1a 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -346,7 +346,8 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") - check_variable_and_dtype(input, 'input', ['float32'], 'sequence_pool') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'sequence_pool') helper = LayerHelper('sequence_pool', **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index dedee1fdfd403..99cf77aed2609 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -650,9 +650,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): Returns: Variable: Tensor which is created according to shape and dtype. - Raise: + Raises: TypeError: The dtype must be one of bool, float16, float32, float64, int32 and int64 and the data type of out Tensor must be the same as the dtype. + TypeError: The shape must be one of list, tuple and Variable. Examples: .. code-block:: python @@ -665,7 +666,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): # attr shape is a list which contains Variable Tensor. positive_2 = fluid.layers.fill_constant([1], "int32", 2) - data3 = fluid.layers.fill_constant(shape=[1, positive_2], dtype='float32', value=1.5) # data3=[1.5, 1.5] + data3 = fluid.layers.fill_constant(shape=[1, positive_2], dtype='float32', value=1.5) # data3=[[1.5, 1.5]] # attr shape is an Variable Tensor. 
shape = fluid.layers.fill_constant([2], "int32", 2) # shape=[2,2] @@ -1322,25 +1323,35 @@ def isfinite(x): return out -def range(start, end, step, dtype): +def range(start, end, step, dtype, name=None): """ Return evenly spaced values within a given interval. - Values are generated within the half-open interval [start, stop) (in other words, - the interval including start but excluding stop). + Values are generated within the half-open interval [start, stop) (in other + words, the interval including start but excluding stop). + + If dtype is float32 or float64, we advise adding a small epsilon to end to + avoid floating point rounding errors when comparing against end. Parameters: - start(float32 | float64 | int32 | int64 | Variable): Start of interval. The interval includes this value. - when start is Variable, it is a 1-D Tensor with shape [1]. - end(float32 | float64 | int32 | int64 | Variable): End of interval. The interval does not include this - value, except in some cases where step is not an integer - and floating point round-off affects the length of out. When end is Variable, - it is a 1-D Tensor with shape [1]. - step(float32 | float64 | int32 | int64 | Variable): Spacing between values. For any output out, this is the - distance between two adjacent values, out[i+1] - out[i]. - dtype(str|core.VarDesc.VarType): the data type of the output tensor, can be float32, float64, int32, int64. - - Returns: a 1-D Tensor which is evenly spaced values within a given interval. Its data type is set by dtype. + start(float|int|Variable): Start of interval. The interval includes + this value. If start is Variable, it is a 1-D Tensor with shape [1], + and it's data type should be one of int32, int64, float32, float64. + end(float|int|Variable): End of interval. The interval does not include + this value. When end is Variable, it is a 1-D Tensor with shape [1], + and it's data type should be int32, int64, float32, float64. 
+ step(float|int|Variable): Spacing between values. For any out, this is
+ the distance between two adjacent values, out[i+1] - out[i].
+ When end is Variable, it is a 1-D Tensor with shape [1], and its
+ data type should be one of int32, int64, float32, float64.
+ dtype(str|np.dtype|core.VarDesc.VarType): The data type of the output
+ tensor, can be float32, float64, int32, int64.
+ name(str, optional): Normally there is no need for user to set this property.
+ For more information, please refer to :ref:`api_guide_Name` .
+ Default is None.
+
+ Returns: a 1-D Tensor which is evenly spaced values within a given interval.
+ Its data type is set by dtype.

Return type: Variable

@@ -1348,44 +1359,47 @@ def range(start, end, step, dtype):

.. code-block:: python

- import paddle.fluid as fluid
- data = fluid.layers.range(0, 10, 2, 'int32')
+ import paddle.fluid as fluid

- """
- check_type(start, 'start', (float, int, Variable), 'range')
- check_type(end, 'end', (float, int, Variable), 'range')
- check_type(step, 'step', (float, int, Variable), 'range')
- helper = LayerHelper("range", **locals())
+ out1 = fluid.layers.range(0, 10, 2, 'int32')
+ # [0, 2, 4, 6, 8]

- check_dtype(dtype, 'create data type',
- ['float32', 'float64', 'int32', 'int64'], 'range')
+ start_var = fluid.layers.fill_constant([1], 'int64', 3)
+ out2 = fluid.layers.range(start_var, 7, 1, 'int64')
+ # [3, 4, 5, 6]
+
+ """
+ if not isinstance(dtype, core.VarDesc.VarType):
+ dtype = convert_np_dtype_to_dtype_(dtype)

- dtype = convert_dtype(dtype)
if not isinstance(start, Variable):
start = fill_constant([1], dtype, start)
- elif convert_dtype(start.dtype) != dtype:
- # make sure that start, end, step has the same dtype as
- # `dtype`
- start = cast(x=start, dtype=dtype)
+ elif start.dtype != dtype:
+ start = cast(start, dtype)

if not isinstance(end, Variable):
end = fill_constant([1], dtype, end)
- elif convert_dtype(end.dtype) != dtype:
- end = cast(x=end, dtype=dtype)
+ elif end.dtype != dtype:
+ end = 
cast(end, dtype) if not isinstance(step, Variable): step = fill_constant([1], dtype, step) - elif convert_dtype(step.dtype) != dtype: - step = cast(x=step, dtype=dtype) + elif step.dtype != dtype: + step = cast(step, dtype) - out = helper.create_variable_for_type_inference(dtype=start.dtype) + if in_dygraph_mode(): + return core.ops.range(start, end, step) + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], + 'range/arange') + helper = LayerHelper('range', **locals()) + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='range', inputs={'Start': start, 'End': end, 'Step': step}, - outputs={'Out': [out]}) + outputs={'Out': out}) out.stop_gradient = True return out @@ -1411,6 +1425,12 @@ def linspace(start, stop, num, dtype=None, name=None): the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ the value with input :attr:`start`. + Raises: + TypeError: The dtype must be one of float32 and float64. + TypeError: The dtype of `start` and `stop` must be one of float32 and float64. + TypeError: The dtype of `num` must be one of int32 and int64. + + Examples: .. code-block:: python @@ -1538,7 +1558,11 @@ def diag(diagonal): return out -def eye(num_rows, num_columns=None, batch_shape=None, dtype='float32'): +def eye(num_rows, + num_columns=None, + batch_shape=None, + dtype='float32', + name=None): """ :alias_main: paddle.eye :alias: paddle.eye,paddle.tensor.eye,paddle.tensor.creation.eye @@ -1546,19 +1570,25 @@ def eye(num_rows, num_columns=None, batch_shape=None, dtype='float32'): **eye** - This function constructs an identity tensor, or a batch of tensor. + This function constructs a or a batch of 2-D tensor with ones on the diagonal and zeros elsewhere. Args: num_rows(int): the number of rows in each batch tensor. - num_columns(int): the number of columns in each batch tensor. - If None, default: num_rows. 
- batch_shape(list(int)): If provided, the returned tensor will have a leading
- batch size of this shape.
- dtype(string): The data type of the returned tensor.
- It should be int32, int64, float16, float32, float64.
+ num_columns(int, optional): the number of columns in each batch tensor.
+ If None, default: num_rows.
+ batch_shape(list(int), optional): If provided, the returned tensor will have a leading
+ batch size of this shape, default is None.
+ dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned tensor.
+ It should be int32, int64, float16, float32, float64, default is 'float32'.
+ name(str, optional): The default value is None. Normally there is no
+ need for user to set this property. For more information, please
+ refer to :ref:`api_guide_Name`.

Returns:
Variable: An identity Tensor or LoDTensor of shape batch_shape + [num_rows, num_columns].

+ Raises:
+ TypeError: The `dtype` must be one of float16, float32, float64, int32 and int64.
+ TypeError: The `num_columns` must be a non-negative int.

Examples:
.. 
code-block:: python @@ -1579,38 +1609,55 @@ def eye(num_rows, num_columns=None, batch_shape=None, dtype='float32'): """ - helper = LayerHelper("eye", **locals()) - if not isinstance(num_rows, int) or num_rows < 0: - raise TypeError("num_rows should be a non-negative int") + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) if num_columns is not None: if not isinstance(num_columns, int) or num_columns < 0: raise TypeError("num_columns should be a non-negative int") else: num_columns = num_rows - out = helper.create_variable_for_type_inference(dtype=dtype) - c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op( - type='eye', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'num_rows': num_rows, - 'num_columns': num_columns, - 'dtype': c_dtype - }, - stop_gradient=True) - out.stop_gradient = True + + if in_dygraph_mode(): + out = core.ops.eye('dtype', dtype, 'num_rows', num_rows, 'num_columns', + num_columns) + + else: + helper = LayerHelper("eye", **locals()) + check_dtype(dtype, 'dtype', + ['float16', 'float32', 'float64', 'int32', 'int64'], 'eye') + if not isinstance(num_rows, int) or num_rows < 0: + raise TypeError("num_rows should be a non-negative int") + out = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type='eye', + inputs={}, + outputs={'Out': [out]}, + attrs={ + 'num_rows': num_rows, + 'num_columns': num_columns, + 'dtype': dtype + }, + stop_gradient=True) if batch_shape is not None: + re_shape = [1] * len(batch_shape) + re_shape = re_shape + [num_rows, num_columns] + expand_times = batch_shape + [1, 1] + if in_dygraph_mode(): + out = core.ops.reshape(out, 'shape', re_shape) + return core.ops.expand(out, 'expand_times', expand_times) + if not isinstance(batch_shape, list): raise TypeError("batch_shape should be a list") - from .nn import stack - for batch_val in reversed(batch_shape): + for batch_val in (batch_shape): if batch_val <= 0: raise TypeError("batch_shape should 
be a positive int list") - else: - stack_vars = [out for _ in numpy.arange(batch_val)] - out = stack(stack_vars, axis=0) + + from .nn import reshape, expand + out = reshape(x=out, shape=re_shape) + out = expand(x=out, expand_times=expand_times) + + out.stop_gradient = True return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 165c44b96407b..6e7a90e44e5f2 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4884,48 +4884,272 @@ def minimize(self, loss, startup_program=None): inputs={"X": fast_var}, outputs={"Out": slow_var}) - # Add Var k to main prog and startup prog - k = layers.create_global_var( - name="lookahead_k", - shape=[1], - value=int(self.k), - dtype='int32', - persistable=True) + with framework.program_guard(main_block.program, startup_program): + # Add Var k to main prog and startup prog + k = layers.create_global_var( + name="lookahead_k", + shape=[1], + value=int(self.k), + dtype='int32', + persistable=True) - # Add Var alpha to main prog and startup prog - alpha = layers.create_global_var( - name="lookahead_alpha", - shape=[1], - value=float(self.alpha), - dtype='float32', - persistable=True) + # Add Var alpha to main prog and startup prog + alpha = layers.create_global_var( + name="lookahead_alpha", + shape=[1], + value=float(self.alpha), + dtype='float32', + persistable=True) - # Add Var step - step = layers.create_global_var( - name="lookahead_step", - shape=[1], - value=int(0), - dtype='int32', - persistable=True) - layers.increment(x=step, value=1.0, in_place=True) - - # lookahead - zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0) - - one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0) - - mod = layers.elementwise_mod(step, k) - with layers.control_flow.Switch() as switch: - with switch.case(mod == zero_var): - for param_name in params: - fast_var = main_block.var(param_name) - slow_var = param_to_slow[param_name] - tmp_var = 
layers.elementwise_add( - layers.elementwise_mul(fast_var, alpha), - layers.elementwise_mul( - slow_var, layers.elementwise_sub(one_var, alpha))) - layers.assign(input=tmp_var, output=slow_var) - layers.assign(input=tmp_var, output=fast_var) - with switch.default(): - pass + # Add Var step + step = layers.create_global_var( + name="lookahead_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True) + layers.increment(x=step, value=1.0, in_place=True) + + # lookahead + zero_var = layers.fill_constant( + shape=[1], dtype='float32', value=0.0) + + one_var = layers.fill_constant( + shape=[1], dtype='float32', value=1.0) + + mod = layers.elementwise_mod(step, k) + with layers.control_flow.Switch() as switch: + with switch.case(mod == zero_var): + for param_name in params: + fast_var = main_block.var(param_name) + slow_var = param_to_slow[param_name] + tmp_var = layers.elementwise_add( + layers.elementwise_mul(fast_var, alpha), + layers.elementwise_mul( + slow_var, + layers.elementwise_sub(one_var, alpha))) + layers.assign(input=tmp_var, output=slow_var) + layers.assign(input=tmp_var, output=fast_var) + with switch.default(): + pass return mini_out + + +class GradientMergeOptimizer(object): + """ + Gradient Merge, also called as Gradient Accumulation, + is a training strategy for larger batches. With this strategy, + the parameter will not be updated until specific steps. + + For each step, the forward network and the backward network + will run to calculate the gradient of the parameters. + + For every k step, the optimization network will run, + applying a specific optimization method (such as SGD, Adam) + to the parameters. + + Args: + inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam) + which update the parameters + k_steps (int): the update period of the parameters + avg (bool): whether to average the gradients of each mini-batch, + the default value is `True` + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + def gen_data(batch_size): + return {"x": np.random.random(size=(batch_size, 32)).astype('float32'), + "y": np.random.random(size=(batch_size, 1)).astype('int64')} + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) + prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + sum_cost = fluid.layers.reduce_mean(cost) + return sum_cost, fc_1, prediction + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') + input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + sgd = fluid.optimizer.Adam(learning_rate=0.01) + sgd = fluid.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True) + sgd.minimize(cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=gen_data(32), + program=fluid.default_main_program(), + fetch_list=[cost.name]) + print("step=%d, cost=%f" % (i, cost_val[0])) + """ + + def __init__(self, inner_optimizer, k_steps=1, avg=True): + if framework.in_dygraph_mode(): + raise Exception( + "In dygraph, we don't support GradientMergeOptimizer." + "You can do Gradient merge by yourself with k-times forward + backward, " + "and one-time optimizer.minimize()") + + assert (inner_optimizer is not None), "inner optimizer can not be None" + assert (isinstance(k_steps, int) and + k_steps > 0), "k_steps should be a positive integer" + + self.inner_optimizer = inner_optimizer + self.k_steps = k_steps + self.type = "gradient_merge" + self.avg = avg + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + + assert isinstance(loss, Variable), "The loss should be an Variable." 
+ assert ( + parameter_list is None + ), "The parameter_list should be None when using GradientMergeOptimizer" + assert ( + no_grad_set is None + ), "The no_grad_set should be None when using GradientMergeOptimizer" + + params_grads = self.inner_optimizer.backward( + loss, startup_program=startup_program) + + #TODO(mapingshuo) support sparse embedding + for k, v in params_grads: + assert ( + v.type != core.VarDesc.VarType.SELECTED_ROWS + ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" + + param_to_grad = {k.name: v for (k, v) in params_grads} + + # Get startup_program and main_program + if startup_program is None: + startup_program = default_startup_program() + main_block = loss.block + + # add some vars to the main_program and startup_program + startup_block = startup_program.global_block() + param_names = param_to_grad.keys() + param_to_gradient_merge = {} + + for param_name in param_names: + param_var = main_block.var(param_name) + assert (param_var is not None) + gradient_merge_var = main_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True) + param_to_gradient_merge[param_name] = gradient_merge_var + startup_gradient_merge_var = startup_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True) + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": param_var.shape, + "dtype": param_var.dtype, + "value": float(0), + }) + + with framework.program_guard(main_block.program, startup_program): + # Add Var k to main prog and startup prog + gradient_merge_k = layers.create_global_var( + name="gradient_merge_k", + shape=[1], + value=int(self.k_steps), + dtype='int32', + persistable=True) + + # Add Var step + gradient_merge_step = layers.create_global_var( + name="gradient_merge_step", + shape=[1], + value=int(0), + dtype='int32', + 
persistable=True)
+ layers.increment(x=gradient_merge_step, value=1.0, in_place=True)
+
+ # gradient merge
+ zero_var = layers.fill_constant(
+ shape=[1], dtype='float32', value=0.0)
+ one_var = layers.fill_constant(
+ shape=[1], dtype='float32', value=1.0)
+
+ mod = layers.elementwise_mod(gradient_merge_step, gradient_merge_k)
+ with layers.control_flow.Switch() as switch:
+ with switch.case(mod != zero_var):
+ # 1. update the gradient_merge_vars
+ # gradient_merge_vars += gradient_vars
+ cur_block = main_block.program.current_block()
+ for param_name in param_names:
+ grad = param_to_grad[param_name]
+ grad_merge = param_to_gradient_merge[param_name]
+ cur_block.append_op(
+ type="elementwise_add",
+ inputs={'X': grad,
+ 'Y': grad_merge},
+ outputs={'Out': grad_merge},
+ attrs={'axis': -1,
+ 'use_mkldnn': False})
+
+ with switch.default():
+ # 1. update the gradient_vars
+ # gradient_vars += gradient_merge_vars
+ cur_block_idx = main_block.program.current_block_idx
+ cur_block = main_block.program.current_block()
+ for param_name in param_names:
+ grad = param_to_grad[param_name]
+ grad_merge = param_to_gradient_merge[param_name]
+ if self.avg:
+ tmp_var = layers.elementwise_add(grad, grad_merge)
+ cur_block.append_op(
+ type='scale',
+ inputs={'X': tmp_var},
+ outputs={'Out': grad},
+ attrs={
+ 'scale': 1.0 / self.k_steps,
+ 'bias': 0.0,
+ 'bias_after_scale': False
+ })
+ else:
+ cur_block.append_op(
+ type="elementwise_add",
+ inputs={'X': grad,
+ 'Y': grad_merge},
+ outputs={'Out': grad},
+ attrs={'axis': -1,
+ 'use_mkldnn': False})
+
+ # 2. apply_optimize
+ target_grad_block = main_block.program._create_block(
+ parent_idx=cur_block.parent_idx)
+ target_grad_block._set_forward_block_idx(cur_block_idx)
+ main_block.program.current_block_idx = cur_block_idx
+
+ optimize_ops = self.inner_optimizer.apply_optimize(
+ loss,
+ startup_program=startup_program,
+ params_grads=params_grads)
+
+ # 3. 
clear gradient_merge_vars + for param_name in param_names: + grad_merge = param_to_gradient_merge[param_name] + layers.fill_constant( + shape=grad_merge.shape, + dtype=grad_merge.dtype, + value=0.0, + out=grad_merge) + return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1f59b081454f3..17893a12189c1 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -32,6 +32,9 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_base) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -56,13 +59,10 @@ if(WIN32) # TODO: Fix these unittests failed on Windows LIST(REMOVE_ITEM TEST_OPS test_debugger) - list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_fake_init_op) list(REMOVE_ITEM TEST_OPS test_merge_ids_op) list(REMOVE_ITEM TEST_OPS test_split_ids_op) - list(REMOVE_ITEM TEST_OPS test_program_code) LIST(REMOVE_ITEM TEST_OPS test_ref_by_trainer_id_op) - LIST(REMOVE_ITEM TEST_OPS test_math_op_patch_var_base) endif() if(APPLE OR WIN32) @@ -339,6 +339,11 @@ if(WITH_DISTRIBUTE) py_test_modules(test_communicator_half_async MODULES test_communicator_half_async ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1) py_test_modules(test_communicator_sync MODULES test_communicator_sync ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1) py_test_modules(test_collective_optimizer MODULES test_collective_optimizer) + if(NOT APPLE) + 
py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_meta_optimizer MODULES test_fleet_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) + endif(NOT APPLE) if(WITH_DGC) # if with dgc, test all dgc tests. # NOTE. dist dgc tests is already in DIST_TEST_OPS @@ -374,16 +379,22 @@ py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_para py_test_modules(test_data_norm_op MODULES test_data_norm_op) py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000) -if(NOT WIN32) - # TODO: fix these unittests failure on Windows +# NOTE: These unittests will appear NaN steadily in windows CI. After analysis, +# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, +# which will not appear in other CIs. The calculation behavior of some ops in inference mode is +# inconsistent with that in non-inference mode. 
+if(NOT ON_INFER) py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu) py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu) py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu) - py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) set_tests_properties(test_parallel_executor_seresnext_base_cpu PROPERTIES TIMEOUT 900) set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu PROPERTIES TIMEOUT 750) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu PROPERTIES TIMEOUT 750) +endif() +if(NOT WIN32) + # TODO: fix these unittests failure on Windows + py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer) # FIXME(zcd): temporally disable test_parallel_executor_fetch_feed in Windows CI because of the random failure. py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index 2520b3722882d..bb7e0ca2a0ca7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -42,7 +42,7 @@ def ops(self): # causes crush in dy2stat. Set it to True after fixing it. 
emb = Embedding( size=[self.dict_size, self.emb_dim], - is_sparse=False, + is_sparse=True, padding_idx=self.padding_idx, param_attr=attr.ParamAttr( name=self.name, initializer=fluid.initializer.Xavier())) @@ -492,25 +492,16 @@ def forward(self, left, right): left_soft = softsign_layer.ops(bow_left) right_soft = softsign_layer.ops(bow_right) - left_bow = self.bow_layer(left_soft) - right_bow = self.bow_layer(right_soft) - cos_sim_layer = CosSimLayer() - pred = cos_sim_layer.ops(left_bow, right_bow) - return left_bow, pred - - # TODO(huihuangzheng): uncomment the following return statements after - # we fix it. - # # matching layer - #if self.task_mode == "pairwise": - # left_bow = self.bow_layer(left_soft) - # right_bow = self.bow_layer(right_soft) - # cos_sim_layer = CosSimLayer() - # pred = cos_sim_layer.ops(left_bow, right_bow) - # return left_bow, pred - #else: - # concat_layer = ConcatLayer(1) - # concat = concat_layer.ops([left_soft, right_soft]) - # concat_fc = self.bow_layer_po(concat) - # pred = self.softmax_layer(concat_fc) - # return left_soft, pred + if self.task_mode == "pairwise": + left_bow = self.bow_layer(left_soft) + right_bow = self.bow_layer(right_soft) + cos_sim_layer = CosSimLayer() + pred = cos_sim_layer.ops(left_bow, right_bow) + return left_bow, pred + else: + concat_layer = ConcatLayer(1) + concat = concat_layer.ops([left_soft, right_soft]) + concat_fc = self.bow_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return left_soft, pred diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 0541c37fc71b0..09be10e6c8a7e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -108,12 +108,9 @@ def forward(self, inputs, label=None): loss = fluid.layers.cross_entropy(x, label) avg_loss = fluid.layers.mean(loss) - # TODO: Uncomment code 
after "return" statement can be transformed correctly. - - # return x, acc, avg_loss - # else: - # return x - return x, acc, avg_loss + return x, acc, avg_loss + else: + return x def inference(self, inputs): x = self._simple_img_conv_pool_1(inputs) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py new file mode 100644 index 0000000000000..4813930159744 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gym +import math +import itertools +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.dygraph.nn as nn +from paddle.fluid.dygraph import to_variable, Layer +from paddle.fluid.dygraph import declarative, ProgramTranslator + +import unittest + +SEED = 2020 +program_translator = ProgramTranslator() + + +class Policy(Layer): + def __init__(self): + super(Policy, self).__init__() + + self.affine1 = nn.Linear(4, 128) + self.affine2 = nn.Linear(128, 2) + self.dropout_ratio = 0.6 + + self.saved_log_probs = [] + self.rewards = [] + + @declarative + def forward(self, x): + x = fluid.layers.reshape(x, shape=[1, 4]) + x = self.affine1(x) + x = fluid.layers.dropout(x, self.dropout_ratio) + x = fluid.layers.relu(x) + action_scores = self.affine2(x) + + log_prob = fluid.layers.softmax(action_scores, axis=1) + + return log_prob + + +class Args(object): + gamma = 0.99 + log_interval = 1 + train_step = 10 + + +def train(args, place, to_static): + program_translator.enable(to_static) + + env = gym.make('CartPole-v0') + env.seed(SEED) + + with fluid.dygraph.guard(place): + fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + local_random = np.random.RandomState(SEED) + + policy = Policy() + + eps = np.finfo(np.float32).eps.item() + optimizer = fluid.optimizer.AdamaxOptimizer( + learning_rate=1e-2, parameter_list=policy.parameters()) + + def get_mean_and_std(values=[]): + n = 0. + s = 0. + for val in values: + s += val + n += 1 + mean = s / n + + std = 0. + for val in values: + std += (val - mean) * (val - mean) + std /= n + std = math.sqrt(std) + + return mean, std + + def sample_action(probs): + sample = local_random.random_sample() + idx = 0 + + while idx < len(probs) and sample > probs[idx]: + sample -= probs[idx] + idx += 1 + mask = [0.] * len(probs) + mask[idx] = 1. 
+ + return idx, np.array([mask]).astype("float32") + + def choose_best_action(probs): + idx = 0 if probs[0] > probs[1] else 1 + mask = [1., 0.] if idx == 0 else [0., 1.] + + return idx, np.array([mask]).astype("float32") + + def select_action(state): + state = to_variable(state) + state.stop_gradient = True + loss_probs = policy(state) + + probs = loss_probs.numpy() + + action, _mask = sample_action(probs[0]) + mask = to_variable(_mask) + mask.stop_gradient = True + + loss_probs = fluid.layers.log(loss_probs) + loss_probs = fluid.layers.elementwise_mul(loss_probs, mask) + loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1) + + policy.saved_log_probs.append(loss_probs) + return action, loss_probs + + def finish_episode(): + R = 0 + policy_loss = [] + returns = [] + for r in policy.rewards[::-1]: + R = r + args.gamma * R + returns.insert(0, R) + + mean, std = get_mean_and_std(returns) + + returns = np.array(returns).astype("float32") + returns = (returns - mean) / (std + eps) + + # calculate policy loss of each step. 
+ for log_prob, R in zip(policy.saved_log_probs, returns): + log_prob_numpy = log_prob.numpy() + + R_numpy = np.ones_like(log_prob_numpy).astype("float32") + _R = -1 * R * R_numpy + _R = to_variable(_R) + _R.stop_gradient = True + cur_loss = fluid.layers.elementwise_mul(_R, log_prob) + policy_loss.append(cur_loss) + + policy_loss = fluid.layers.concat(policy_loss) + policy_loss = fluid.layers.reduce_sum(policy_loss) + + policy_loss.backward() + optimizer.minimize(policy_loss) + policy.clear_gradients() + + del policy.rewards[:] + del policy.saved_log_probs[:] + + return returns + + loss_data = [] + running_reward = 10 + for i_episode in itertools.count(1): + state, ep_reward = env.reset(), 0 + # The default loop number is 10000 is models, we changed it to 1000 for smaller test + for t in range(1, 1000): + state = np.array(state).astype("float32") + action, loss = select_action(state) + state, reward, done, _ = env.step(action) + + # log loss_probs + loss_data.append(loss.numpy()[0]) + + policy.rewards.append(reward) + ep_reward += reward + + if done: + break + + # sum loss and apply optimization + returns = finish_episode() + + running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward + if i_episode % args.log_interval == 0: + print( + 'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'. 
+ format(i_episode, ep_reward, running_reward, + loss.numpy()[0])) + + if i_episode > args.train_step: + break + + return np.array(loss_data) + + +class TestDeclarative(unittest.TestCase): + def setUp(self): + self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ + else fluid.CPUPlace() + self.args = Args() + + def test_train(self): + st_out = train(self.args, self.place, to_static=True) + dy_out = train(self.args, self.place, to_static=False) + self.assertTrue( + np.allclose(st_out, dy_out), + msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py index 94b9bb86be241..552a6307f3337 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py @@ -28,7 +28,7 @@ def create_conf_dict(): conf_dict = {} - conf_dict["task_mode"] = "train" + conf_dict["task_mode"] = "pairwise" conf_dict["net"] = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128} conf_dict["loss"] = {"margin": 0.1} return conf_dict @@ -149,7 +149,6 @@ def train(conf_dict, to_static): pred = pos_score _, neg_score = net(left, neg_right) avg_cost = loss.compute(pos_score, neg_score) - #avg_cost = loss.compute(pos_score, pos_score) losses.append(np.mean(avg_cost.numpy())) avg_cost.backward() optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 0b8ea1f9392e9..d3a53bbbff981 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -78,7 +78,10 @@ def _get_analysis_outputs(self, config): shape = tensor_shapes[name] shape[0] = 1 tensor = 
predictor.get_input_tensor(name) - tensor.copy_from_cpu(list(self.feeds.values())[i]) + feed_data = list(self.feeds.values())[i] + tensor.copy_from_cpu(np.array(feed_data)) + if type(feed_data) == fluid.LoDTensor: + tensor.set_lod(feed_data.lod()) predictor.zero_copy_run() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py index 16979488a614e..d6dbd397b9036 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py @@ -44,7 +44,8 @@ def setUp(self): def test_check_output(self): if core.is_compiled_with_cuda(): - self.check_output_with_option([True]) + use_gpu = True + self.check_output_with_option(use_gpu) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py index f4014f7cd42e4..2e9035420d7ee 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py @@ -46,7 +46,8 @@ def setUp(self): def test_check_output(self): if core.is_compiled_with_cuda(): - self.check_output_with_option([True]) + use_gpu = True + self.check_output_with_option(use_gpu) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py index cea007d56e41b..7c4e0d6e76ec4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py @@ -42,7 +42,8 @@ def setUp(self): def test_check_output(self): if core.is_compiled_with_cuda(): - self.check_output_with_option([True]) + use_gpu = True + self.check_output_with_option(use_gpu) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index 6444264f80fb5..dfcd1758db2b2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -42,7 +42,8 @@ def setUp(self): def test_check_output(self): # There is no cpu pass for transpose_flatten_concat_fuse if core.is_compiled_with_cuda(): - self.check_output_with_option([True]) + use_gpu = True + self.check_output_with_option(use_gpu) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py index 41f02b0427d68..4661333ffeca1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py @@ -48,7 +48,8 @@ def setUp(self): def test_check_output(self): # There is no cpu pass for transpose_flatten_concat_fuse if core.is_compiled_with_cuda(): - self.check_output_with_option([True]) + use_gpu = True + self.check_output_with_option(use_gpu) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 8dbdd2921b9a0..7d687dbd0f85f 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py 
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -183,6 +183,148 @@ def test_dygraph(self): self.assertEqual(z, z_expected) +class TestSinh(TestActivation): + def setUp(self): + self.op_type = "sinh" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.sinh(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + def test_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.array([0.1]) + x = fluid.dygraph.to_variable(np_x) + z = fluid.layers.sinh(x).numpy() + z_expected = np.sinh(np_x) + self.assertTrue(np.allclose(z, z_expected)) + + def test_api(self): + test_data_shape = [11, 17] + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = np.random.uniform(0.1, 1, + test_data_shape).astype("float32") + data_x = fluid.layers.data( + name="data_x", + shape=test_data_shape, + append_batch_size=False, + dtype="float32") + + pd_sinh_out = fluid.layers.sinh(data_x) + exe = fluid.Executor(place=fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + np_sinh_res = exe.run(fluid.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[pd_sinh_out]) + + expected_res = np.sinh(input_x) + self.assertTrue(np.allclose(np_sinh_res, expected_res)) + + def test_backward(self): + test_data_shape = [11, 17] + with fluid.dygraph.guard(): + input_x = np.random.uniform(0.1, 1, + test_data_shape).astype("float32") + var = fluid.dygraph.to_variable(input_x) + var.stop_gradient = False + loss = fluid.layers.sinh(var) + loss.backward() + grad_var = var.gradient() + self.assertEqual(grad_var.shape, input_x.shape) + + +class TestSinhOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program()): + # The input type must be Variable. 
+ self.assertRaises(TypeError, fluid.layers.sinh, 1) + # The input dtype must be float16, float32, float64. + x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, fluid.layers.sinh, x_int32) + # support the input dtype is float16 + x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + fluid.layers.sinh(x_fp16) + + +class TestCosh(TestActivation): + def setUp(self): + self.op_type = "cosh" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.cosh(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + def test_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.array([0.1]) + x = fluid.dygraph.to_variable(np_x) + z = fluid.layers.cosh(x).numpy() + z_expected = np.cosh(np_x) + self.assertTrue(np.allclose(z, z_expected)) + + def test_api(self): + test_data_shape = [11, 17] + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = np.random.uniform(0.1, 1, + test_data_shape).astype("float32") + data_x = fluid.layers.data( + name="data_x", + shape=test_data_shape, + append_batch_size=False, + dtype="float32") + + pd_cosh_out = paddle.cosh(data_x) + exe = fluid.Executor(place=fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + np_cosh_res = exe.run(fluid.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[pd_cosh_out]) + + expected_res = np.cosh(input_x) + self.assertTrue(np.allclose(np_cosh_res, expected_res)) + + def test_backward(self): + test_data_shape = [11, 17] + with fluid.dygraph.guard(): + input_x = np.random.uniform(0.1, 1, + test_data_shape).astype("float32") + var = fluid.dygraph.to_variable(input_x) + var.stop_gradient = False + loss = fluid.layers.cosh(var) + loss.backward() + grad_var = var.gradient() + self.assertEqual(grad_var.shape, input_x.shape) + + +class 
TestCoshOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program()): + # The input type must be Variable. + self.assertRaises(TypeError, fluid.layers.cosh, 1) + # The input dtype must be float16, float32, float64. + x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, fluid.layers.cosh, x_int32) + # support the input dtype is float16 + x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + fluid.layers.cosh(x_fp16) + + class TestTanhShrink(TestActivation): def setUp(self): self.op_type = "tanh_shrink" @@ -799,22 +941,15 @@ def test_api(self): shape=[11, 17], append_batch_size=False, dtype="float64") - res_log1p = fluid.layers.data( - name="res_log1p", - shape=[11, 17], - append_batch_size=False, - dtype="float64") out1 = paddle.log1p(data_x) - out2 = paddle.log1p(data_x, out=res_log1p) exe = fluid.Executor(place=fluid.CPUPlace()) exe.run(fluid.default_startup_program()) - res1, res_in = exe.run(fluid.default_main_program(), - feed={"data_x": input_x}, - fetch_list=[out1, res_log1p]) + res1 = exe.run(fluid.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[out1]) expected_res = np.log1p(input_x) - np.testing.assert_allclose(res1, expected_res) - np.testing.assert_allclose(res_in, expected_res) + self.assertTrue(np.allclose(res1, expected_res)) # dygraph with fluid.dygraph.guard(): @@ -823,7 +958,7 @@ def test_api(self): z = paddle.log1p(data_x) np_z = z.numpy() z_expected = np.array(np.log1p(np_x)) - np.testing.assert_allclose(np_z, z_expected) + self.assertTrue(np.allclose(np_z, z_expected)) class TestSquare(TestActivation): @@ -1211,8 +1346,10 @@ def test_check_grad(self): create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestFloor, grad_check=False) create_test_act_fp16_class(TestCos, grad_atol=0.85) +create_test_act_fp16_class(TestCosh, grad_atol=0.85) create_test_act_fp16_class(TestAcos, grad_atol=0.85) 
create_test_act_fp16_class(TestSin) +create_test_act_fp16_class(TestSinh) create_test_act_fp16_class(TestAsin) create_test_act_fp16_class(TestAtan) create_test_act_fp16_class(TestRound, grad_check=False) diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py index d715744b02a01..1736e49f3b67b 100644 --- a/python/paddle/fluid/tests/unittests/test_arange.py +++ b/python/paddle/fluid/tests/unittests/test_arange.py @@ -15,7 +15,8 @@ from __future__ import print_function import paddle -import paddle.fluid as fluid +from paddle.fluid import core +from paddle import program_guard, Program import unittest import numpy as np from op_test import OpTest @@ -44,47 +45,67 @@ def test_check_output(self): self.check_output() -class TestFloatArangeOpCase0(TestArangeOp): +class TestFloatArangeOp(TestArangeOp): def init_config(self): self.dtype = np.float32 self.case = (0, 5, 1) -class TestInt32ArangeOpCase0(TestArangeOp): +class TestInt32ArangeOp(TestArangeOp): def init_config(self): self.dtype = np.int32 self.case = (0, 5, 2) -class TestInt32ArangeOpCase1(TestArangeOp): +class TestFloat64ArangeOp(TestArangeOp): def init_config(self): - self.dtype = np.int32 + self.dtype = np.float64 self.case = (10, 1, -2) -class TestInt32ArangeOpCase2(TestArangeOp): +class TestInt64ArangeOp(TestArangeOp): def init_config(self): - self.dtype = np.int32 + self.dtype = np.int64 self.case = (-1, -10, -2) +class TestArangeOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + self.assertRaises(TypeError, paddle.arange, 10, dtype='int8') + + class TestArangeAPI(unittest.TestCase): def test_out(self): - with fluid.program_guard(fluid.Program()): - data = paddle.arange(0, 5, 1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result, = exe.run(fetch_list=[data]) - expected_data = np.arange(0, 5, 1).astype(np.float32) - self.assertEqual((result == expected_data).all(), True) - - with 
fluid.program_guard(fluid.Program()): - data = paddle.arange(0.0, 5.0, 1.0, 'int32') - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result, = exe.run(fetch_list=[data]) - expected_data = np.arange(0, 5, 1).astype(np.int32) - self.assertEqual((result == expected_data).all(), True) + with program_guard(Program(), Program()): + x1 = paddle.arange(0, 5, 1, 'float32') + + place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.Executor(place) + out = exe.run(fetch_list=[x1]) + + expected_data = np.arange(0, 5, 1).astype(np.float32) + self.assertEqual((out == expected_data).all(), True) + + +class TestArangeImperative(unittest.TestCase): + def test_out(self): + place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + with paddle.imperative.guard(place): + x1 = paddle.arange(0, 5, 1) + x2 = paddle.tensor.arange(5) + x3 = paddle.tensor.creation.arange(5) + + start = paddle.imperative.to_variable(np.array([0], 'float32')) + end = paddle.imperative.to_variable(np.array([5], 'float32')) + step = paddle.imperative.to_variable(np.array([1], 'float32')) + x4 = paddle.arange(start, end, step, 'int64') + + expected_data = np.arange(0, 5, 1).astype(np.int64) + for i in [x1, x2, x3, x4]: + self.assertEqual((i.numpy() == expected_data).all(), True) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py new file mode 100644 index 0000000000000..51e447dba725c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py @@ -0,0 +1,194 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import math + + +class Gsz: + def __init__(self, h, w, gd, gh, gw, input_chans): + self.h = h + self.w = w + self.gd = gd + self.gh = gh + self.gw = gw + self.input_chans = input_chans + + +def diff_abs(x): + eps = 1e-8 + return math.sqrt(x * x + eps) + + +def d_diff_abs(x): + eps = 1e-8 + return x / math.sqrt(x * x + eps) + + +def weight_z(x): + abx = diff_abs(x) + return max(1.0 - abx, 0.0) + + +def d_weight_z(x): + abx = diff_abs(x) + if abx > 1.0: + return 0.0 + else: + return d_diff_abs(x) + + +def naive_bilateral_slice_forward(output, grid, guide, input, gsz, has_offset, + total_count, output_chans): + h = gsz.h + w = gsz.w + gd = gsz.gd + gh = gsz.gh + gw = gsz.gw + input_chans = gsz.input_chans + coeff_stride = input_chans + grid_chans = input_chans * output_chans + + if has_offset: + grid_chans += output_chans + coeff_stride += 1 + + for idx in range(total_count): + x = idx % w + y = idx // w % h + out_c = (idx // (h * w)) % output_chans + b = (idx // (output_chans * w * h)) + + gx = (x + 0.5) * gw / (1.0 * w) + gy = (y + 0.5) * gh / (1.0 * h) + gz = guide[int(b), int(y), int(x)] * gd + + fx = int(np.floor(gx - 0.5)) + fy = int(np.floor(gy - 0.5)) + fz = int(np.floor(gz - 0.5)) + + value = 0.0 + for in_c in range(0, coeff_stride): + coeff_sample = 0.0 + + for xx in range(fx, fx + 2): + x_ = max(min(xx, gw - 1), 0) + wx = max(1.0 - abs(xx + 0.5 - gx), 0.0) + + for yy in range(fy, fy + 2): + y_ = max(min(yy, gh - 1), 0) + wy = max(1.0 - abs(yy + 0.5 - gy), 0.0) + + for 
zz in range(fz, fz + 2): + z_ = max(min(zz, gd - 1), 0) + wz = weight_z(zz + 0.5 - gz) + c_ = coeff_stride * out_c + in_c + + coeff_sample += grid[int(b), int(c_), int(z_), int(y_), + int(x_)] * wx * wy * wz + + if in_c < input_chans: + value += coeff_sample * input[int(b), int(in_c), int(y), int(x)] + else: + value += coeff_sample + output[int(b), int(out_c), int(y), int(x)] = value + + +def naive_bilateral_slice(x, guide, grid, has_offset): + bs = x.shape[0] + h = x.shape[2] + w = x.shape[3] + input_chans = x.shape[1] + + coeffs_chans = grid.shape[1] + if has_offset: + output_chans = coeffs_chans // (input_chans + 1) + else: + output_chans = coeffs_chans // input_chans + + output = np.zeros([bs, int(output_chans), h, w]).astype(x.dtype) + + gd = grid.shape[2] + gh = grid.shape[3] + gw = grid.shape[4] + + gsz = Gsz(h, w, gd, gh, gw, input_chans) + total_count = bs * h * w * output.shape[1] + naive_bilateral_slice_forward(output, grid, guide, x, gsz, has_offset, + total_count, output.shape[1]) + return output + + +@unittest.skipIf(not paddle.fluid.is_compiled_with_cuda(), + 'CPU testing is not supported') +class TestBilateralSliceOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = 'bilateral_slice' + batch_size = 3 + h = 50 + w = 30 + c = 1 + gh = 5 + gw = 3 + gd = 2 + gc = 2 + x = np.random.rand(batch_size, c, h, w).astype(self.data_type) + guide = np.random.rand(batch_size, h, w).astype(self.data_type) + grid = np.random.rand(batch_size, gc, gd, gh, gw).astype(self.data_type) + output_np = naive_bilateral_slice(x, guide, grid, self.has_offset) + + self.inputs = {'X': x, 'Grid': grid, 'Guide': guide} + self.attrs = {'has_offset': self.has_offset, } + self.outputs = {'Out': output_np} + + def test_check_output(self): + place = paddle.fluid.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + self.check_output + + def test_check_grad(self): + place = paddle.fluid.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + def 
initTestCase(self): + self.has_offset = False + self.data_type = 'float64' + + +@unittest.skipIf(not paddle.fluid.is_compiled_with_cuda(), + 'CPU testing is not supported') +class TestBilateralSliceOp1(TestBilateralSliceOp): + def initTestCase(self): + self.has_offset = True + self.data_type = 'float32' + + +class TestBilateralSliceApi(TestBilateralSliceOp): + def test_api(self): + x = paddle.fluid.data( + name='x', shape=[None, 3, 25, 15], dtype='float32') + guide = paddle.fluid.data( + name='guide', shape=[None, 25, 15], dtype='float32') + grid = paddle.fluid.data( + name='grid', shape=[None, 12, 8, 5, 3], dtype='float32') + paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid, + self.has_offset) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cross_op.py b/python/paddle/fluid/tests/unittests/test_cross_op.py index 66509a863ab74..8e53a36f0510d 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_op.py @@ -79,7 +79,7 @@ def test_cross_api(self): with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[-1, 3]) y = fluid.layers.data(name='y', shape=[-1, 3]) - z = paddle.cross(x, y, dim=1) + z = paddle.cross(x, y, axis=1) exe = fluid.Executor(fluid.CPUPlace()) res, = exe.run(feed={'x': self.data_x, 'y': self.data_y}, @@ -103,6 +103,14 @@ def test_cross_api(self): [-1.0, -1.0, -1.0]]) self.assertTrue(np.allclose(expect_out, np.array(res))) + # case 3: + with program_guard(Program(), Program()): + x = fluid.data(name="x", shape=[-1, 3], dtype="float32") + y = fluid.data(name='y', shape=[-1, 3], dtype='float32') + + y_1 = paddle.cross(x, y, name='result') + self.assertEqual(('result' in y_1.name), True) + def test_dygraph_api(self): self.input_data() # case 1: @@ -119,7 +127,7 @@ def test_dygraph_api(self): with fluid.dygraph.guard(): x = fluid.dygraph.to_variable(self.data_x) y = fluid.dygraph.to_variable(self.data_y) - 
z = paddle.cross(x, y, dim=1) + z = paddle.cross(x, y, axis=1) np_z = z.numpy() expect_out = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index 43bf39ccd5379..6b49de536ad39 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -18,6 +18,7 @@ import argparse import time import math +import sys import paddle import paddle.fluid as fluid @@ -177,6 +178,8 @@ def program_equal(a, b): class TestDistMnist(unittest.TestCase): + @unittest.skipIf(sys.platform == "win32", + "Windows does not support distribution") def test_desc_clone(self): get_model(batch_size=20) diff --git a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py index fbbf01abae638..1a0a4ecb74d56 100644 --- a/python/paddle/fluid/tests/unittests/test_eye_op.py +++ b/python/paddle/fluid/tests/unittests/test_eye_op.py @@ -74,32 +74,73 @@ def test_check_output(self): class API_TestTensorEye(unittest.TestCase): def test_out(self): - with fluid.program_guard(fluid.Program()): + with paddle.program_guard(paddle.Program()): data = paddle.eye(10) place = fluid.CPUPlace() - exe = fluid.Executor(place) + exe = paddle.Executor(place) result, = exe.run(fetch_list=[data]) expected_result = np.eye(10, dtype="float32") self.assertEqual((result == expected_result).all(), True) - with fluid.program_guard(fluid.Program()): + with paddle.program_guard(paddle.Program()): data = paddle.eye(10, num_columns=7, dtype="float64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[data]) expected_result = np.eye(10, 7, dtype="float64") self.assertEqual((result == expected_result).all(), True) - with fluid.program_guard(fluid.Program()): + with 
paddle.program_guard(paddle.Program()): data = paddle.eye(10, dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[data]) expected_result = np.eye(10, dtype="int64") self.assertEqual((result == expected_result).all(), True) + with paddle.imperative.guard(): + out = paddle.eye(10, dtype="int64") + expected_result = np.eye(10, dtype="int64") + self.assertEqual((out.numpy() == expected_result).all(), True) + + with paddle.imperative.guard(): + batch_shape = [2] + out = fluid.layers.eye(10, + 10, + dtype="int64", + batch_shape=batch_shape) + result = np.eye(10, dtype="int64") + expected_result = [] + for index in reversed(batch_shape): + tmp_result = [] + for i in range(index): + tmp_result.append(result) + result = tmp_result + expected_result = np.stack(result, axis=0) + self.assertEqual(out.numpy().shape == np.array(expected_result).shape, + True) + self.assertEqual((out.numpy() == expected_result).all(), True) + + with paddle.imperative.guard(): + batch_shape = [3, 2] + out = fluid.layers.eye(10, + 10, + dtype="int64", + batch_shape=batch_shape) + result = np.eye(10, dtype="int64") + expected_result = [] + for index in reversed(batch_shape): + tmp_result = [] + for i in range(index): + tmp_result.append(result) + result = tmp_result + expected_result = np.stack(result, axis=0) + self.assertEqual(out.numpy().shape == np.array(expected_result).shape, + True) + self.assertEqual((out.numpy() == expected_result).all(), True) + def test_errors(self): - with fluid.program_guard(fluid.Program()): + with paddle.program_guard(paddle.Program()): def test_num_rows_type_check(): paddle.eye(-1, dtype="int64") @@ -111,6 +152,11 @@ def test_num_columns_type_check(): self.assertRaises(TypeError, test_num_columns_type_check) + def test_num_columns_type_check(): + paddle.eye(10, num_columns=10, dtype="int8") + + self.assertRaises(TypeError, test_num_columns_type_check) + if 
__name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 0bd3516e48d2c..2ca3729306e1b 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -83,26 +83,6 @@ def test_check_output(self): self.check_output() -class TestFillConstantOp5(unittest.TestCase): - def test_errors(self): - with program_guard(Program()): - out_np = np.zeros(shape=(1), dtype='float32') - out = paddle.zeros(shape=[1], dtype="float32") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result = exe.run(fetch_list=[out]) - self.assertEqual((result == out_np).all(), True) - with program_guard(Program()): - data = fluid.data(name="X", shape=[1], dtype="float32") - out = paddle.ones(shape=[1], out=data, dtype="float32") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - result = exe.run(feed={"X": np.array( - [0.1], dtype="float32")}, - fetch_list=[data, out]) - self.assertEqual(result[0], result[1]) - - class TestFillConstantOpWithSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py new file mode 100644 index 0000000000000..20542da3f05ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -0,0 +1,177 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os + + +class TestFleetBase(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_init(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + + def test_is_first_worker(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_first_worker(): + print("test fleet first worker done.") + + def test_worker_index(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + print(fleet.worker_index()) + + def test_worker_num(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + print(fleet.worker_num()) + + def test_is_worker(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_worker(): + print("test fleet is worker") + + def 
test_worker_endpoints(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + print(fleet.worker_endpoints(to_string=True)) + + def test_server_num(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_server(): + print("fleet server num: {}".format(fleet.server_num())) + + def test_server_index(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_server(): + print("fleet server index: {}".format(fleet.server_index())) + + def test_server_endpoints(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_server(): + print("fleet server index: {}".format( + fleet.server_endpoints(to_string=True))) + + def test_is_server(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_server(): + print("test fleet is server") + + def test_util(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + self.assertEqual(fleet.util, None) + + def test_barrier_worker(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_worker(): + fleet.barrier_worker() + + def test_init_worker(self): + import paddle.fleet as fleet + 
import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_worker(): + fleet.init_worker() + + def test_run_server(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_worker(): + fleet.run_worker() + + def test_stop_worker(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_worker(): + fleet.stop_worker() + + def test_distributed_optimizer(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + + def test_minimize(self): + import paddle + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + 
optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 0668546a703bc..bac03176c8da1 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -109,6 +109,13 @@ def test_hierachical_allreduce(self): strategy.hierachical_allreduce = "True" self.assertEqual(strategy.hierachical_allreduce, False) + def test_hierachical_allreduce_inter_ranks(self): + strategy = paddle.fleet.DistributedStrategy() + strategy.hierachical_allreduce_inter_ranks = 1 + self.assertEqual(strategy.hierachical_allreduce_inter_ranks, 1) + strategy.hierachical_allreduce_inter_ranks = "2" + self.assertEqual(strategy.hierachical_allreduce_inter_ranks, 1) + def test_nccl_comm_num(self): strategy = paddle.fleet.DistributedStrategy() strategy.nccl_comm_num = 1 @@ -220,6 +227,13 @@ def test_num_iteration_per_drop_scope(self): strategy.num_iteration_per_drop_scope = 0.1 self.assertEqual(strategy.num_iteration_per_drop_scope, 1) + def test_num_iteration_per_run(self): + strategy = paddle.fleet.DistributedStrategy() + strategy.num_iteration_per_run = 1 + self.assertEqual(strategy.num_iteration_per_run, 1) + strategy.num_iteration_per_run = 0.1 + self.assertEqual(strategy.num_iteration_per_run, 1) + def test_sync_batch_norm(self): strategy = paddle.fleet.DistributedStrategy() strategy.sync_batch_norm = True @@ -336,6 +350,40 @@ def test_auto(self): strategy.auto = "True" self.assertEqual(strategy.auto, False) + def test_sync_nccl_allreduce(self): + strategy = paddle.fleet.DistributedStrategy() + strategy.sync_nccl_allreduce = True + self.assertEqual(strategy.sync_nccl_allreduce, True) + strategy.sync_nccl_allreduce = False + self.assertEqual(strategy.sync_nccl_allreduce, False) + strategy.sync_nccl_allreduce = "True" + 
self.assertEqual(strategy.sync_nccl_allreduce, False) + + def test_fuse_broadcast_ops(self): + strategy = paddle.fleet.DistributedStrategy() + strategy.fuse_broadcast_ops = True + self.assertEqual(strategy.fuse_broadcast_ops, True) + strategy.fuse_broadcast_ops = False + self.assertEqual(strategy.fuse_broadcast_ops, False) + strategy.fuse_broadcast_ops = "True" + self.assertEqual(strategy.fuse_broadcast_ops, False) + + def test_num_threads(self): + strategy = paddle.fleet.DistributedStrategy() + strategy.num_threads = 1 + self.assertEqual(strategy.num_threads, 1) + strategy.num_threads = 0.1 + self.assertEqual(strategy.num_threads, 1) + + def test_strategy_prototxt(self): + strategy = paddle.fleet.DistributedStrategy() + strategy.sync_nccl_allreduce = True + strategy.save_to_prototxt("dist_strategy.prototxt") + strategy2 = paddle.fleet.DistributedStrategy() + strategy2.load_from_prototxt("dist_strategy.prototxt") + self.assertEqual(strategy.sync_nccl_allreduce, + strategy2.sync_nccl_allreduce) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer.py new file mode 100644 index 0000000000000..9cb300f83d9c4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_graph_execution_optimizer(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.fleet.DistributedStrategy() + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + def test_recompute_optimizer(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = 
paddle.fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_checkpoints = [fc_2] + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_metric.py b/python/paddle/fluid/tests/unittests/test_fleet_metric.py new file mode 100644 index 0000000000000..6e5feece93fc5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_metric.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Test fleet metric.""" + +from __future__ import print_function +import numpy as np +import paddle +import paddle.fluid as fluid +import os +import unittest +import paddle.fleet.metrics.metric as metric +from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker +from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet as fleet + + +class TestFleetMetric(unittest.TestCase): + """Test cases for fleet metric.""" + + def setUp(self): + """Set up, set envs.""" + + class FakeFleet: + """Fake fleet only for test.""" + + def __init__(self): + """Init.""" + self.gloo = fluid.core.Gloo() + self.gloo.set_rank(0) + self.gloo.set_size(1) + self.gloo.set_prefix("123") + self.gloo.set_iface("lo") + self.gloo.set_hdfs_store("./tmp_test_metric", "", "") + self.gloo.init() + + def _all_reduce(self, input, output, mode="sum"): + """All reduce using gloo.""" + input_list = [i for i in input] + ans = self.gloo.all_reduce(input_list, mode) + for i in range(len(ans)): + output[i] = 1 + + def _barrier_worker(self): + """Fake barrier worker, do nothing.""" + pass + + self.fleet = FakeFleet() + fleet._role_maker = self.fleet + + def test_metric_1(self): + """Test cases for metrics.""" + train = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train, startup): + t = fluid.layers.create_global_var( + shape=[1, 1], + value=1, + dtype='int64', + persistable=True, + force_cpu=True) + t1 = fluid.layers.create_global_var( + shape=[1, 1], + value=1, + dtype='int64', + persistable=True, + force_cpu=True) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup) + metric.sum(t, scope) + metric.max(t, scope) + metric.min(t, scope) + metric.auc(t, t1, scope) + metric.mae(t1, 3, scope) + metric.rmse(t1, 3, scope) + metric.mse(t1, 3, scope) + metric.acc(t, t1, scope) + metric.sum(str(t.name), scope) + metric.max(str(t.name), scope) + metric.min(str(t.name), scope) + 
metric.auc(str(t1.name), str(t.name), scope) + metric.mae(str(t1.name), 3, scope) + metric.rmse(str(t1.name), 3, scope) + metric.mse(str(t1.name), 3, scope) + metric.acc(str(t.name), str(t1.name), scope) + arr = np.array([1, 2, 3, 4]) + metric.sum(arr) + metric.max(arr) + metric.min(arr) + arr1 = np.array([[1, 2, 3, 4]]) + arr2 = np.array([[1, 2, 3, 4]]) + arr3 = np.array([1, 2, 3, 4]) + metric.auc(arr1, arr2) + metric.mae(arr, 3) + metric.rmse(arr, 3) + metric.mse(arr, 3) + metric.acc(arr, arr3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_private_function.py b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py new file mode 100644 index 0000000000000..ec99acf109816 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_private_function.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import paddle +import socket +import threading + + +class TestFleetPrivateFunction(unittest.TestCase): + def test_wait_port(self): + def init_server(port): + import time + time.sleep(5) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", port)) + sock.listen(10) + while True: + c, addr = sock.accept() + c.send("0") + c.close() + break + + thr = threading.Thread(target=init_server, args=(9292, )) + thr.start() + + import paddle.fleet as fleet + ep = ["127.0.0.1:9292"] + fleet.base.private_helper_function.wait_server_ready(ep) + + thr.join() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_runtime.py b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py new file mode 100644 index 0000000000000..474e5da1c219c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_runtime.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os + + +class TestFleetRuntime(unittest.TestCase): + def test_fleet_runtime_base(self): + import paddle.fleet.runtime + base = paddle.fleet.runtime.runtime_base.RuntimeBase() + base._run_worker() + base._init_server() + base._run_server() + base._stop_worker() + + def test_fleet_collective_runtime(self): + import paddle.fleet.runtime + collective_runtime = paddle.fleet.runtime.CollectiveRuntime() + collective_runtime._init_worker() + collective_runtime._run_worker() + collective_runtime._init_worker() + collective_runtime._run_server() + collective_runtime._stop_worker() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py new file mode 100644 index 0000000000000..4825035d123df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os + + +class TestFleetUtil(unittest.TestCase): + def test_util_base(self): + import paddle.fleet as fleet + util = fleet.UtilBase() + strategy = fleet.DistributedStrategy() + util._set_strategy(strategy) + role_maker = None # should be fleet.PaddleCloudRoleMaker() + util._set_role_maker(role_maker) + + def test_util_factory(self): + import paddle.fleet as fleet + factory = fleet.base.util_factory.UtilFactory() + strategy = fleet.DistributedStrategy() + role_maker = None # should be fleet.PaddleCloudRoleMaker() + optimize_ops = [] + params_grads = [] + util = factory._create_util(strategy, role_maker, optimize_ops, + params_grads) + self.assertEqual(util.role_maker, None) + + def test_get_util(self): + import paddle.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + default_util = fleet.util + self.assertEqual(default_util, None) + + def test_set_user_defined_util(self): + import paddle.fleet as fleet + + class UserDefinedUtil(fleet.UtilBase): + def __init__(self): + super(UserDefinedUtil, self).__init__() + + def get_user_id(self): + return 10 + + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + my_util = UserDefinedUtil() + fleet.util = my_util + user_id = fleet.util.get_user_id() + self.assertEqual(user_id, 10) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 477b3be76f35f..69fd7d80327f1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -186,7 +186,8 @@ def simple_net_float32(self, is_sparse, 
dtype): k - 1]] = out[k] self.assertTrue( - np.array_equal(static_loss_value, dy_loss_value)) + np.allclose( + static_loss_value, dy_loss_value, rtol=1e-3)) for key, value in six.iteritems(static_param_init): self.assertTrue(np.array_equal(value, dy_param_init[key])) for key, value in six.iteritems(static_param_updated): diff --git a/python/paddle/fluid/tests/unittests/test_index_select_op.py b/python/paddle/fluid/tests/unittests/test_index_select_op.py index 50d04c1a72378..e551989ed322d 100644 --- a/python/paddle/fluid/tests/unittests/test_index_select_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_select_op.py @@ -83,7 +83,7 @@ def test_index_select_api(self): x = fluid.layers.data(name='x', shape=[-1, 4]) index = fluid.layers.data( name='index', shape=[3], dtype='int32', append_batch_size=False) - z = paddle.index_select(x, index, dim=1) + z = paddle.index_select(x, index, axis=1) exe = fluid.Executor(fluid.CPUPlace()) res, = exe.run(feed={'x': self.data_x, 'index': self.data_index}, @@ -124,7 +124,7 @@ def test_dygraph_api(self): with fluid.dygraph.guard(): x = fluid.dygraph.to_variable(self.data_x) index = fluid.dygraph.to_variable(self.data_index) - z = paddle.index_select(x, index, dim=1) + z = paddle.index_select(x, index, axis=1) np_z = z.numpy() expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], [9.0, 10.0, 10.0]]) diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py index 01665597facb2..c532c1bdbaa05 100644 --- a/python/paddle/fluid/tests/unittests/test_io_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py @@ -48,5 +48,26 @@ def test_load_vars_error(self): vars="vars") +class TestSaveInferenceModelAPIError(unittest.TestCase): + def test_useless_feeded_var_names(self): + start_prog = fluid.Program() + main_prog = fluid.Program() + with fluid.program_guard(main_prog, start_prog): + x = fluid.data(name='x', shape=[10, 16], dtype='float32') 
+ y = fluid.data(name='y', shape=[10, 16], dtype='float32') + z = fluid.layers.fc(x, 4) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(start_prog) + with self.assertRaisesRegexp( + ValueError, "not involved in the target_vars calculation"): + fluid.io.save_inference_model( + dirname='./model', + feeded_var_names=['x', 'y'], + target_vars=[z], + executor=exe, + main_program=main_prog) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 640e966354b44..abc46034957cf 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -114,8 +114,11 @@ def setUp(self): def train_and_save_model(self): layer = LinearNet(784, 1) example_inputs, layer, _ = train(layer) + orig_input_types = [type(x) for x in example_inputs] fluid.dygraph.jit.save( layer=layer, model_path=self.model_path, input_spec=example_inputs) + new_input_types = [type(x) for x in example_inputs] + self.assertEqual(orig_input_types, new_input_types) return layer def test_save(self): diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index d9327c9d710ac..803293be9b7d6 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -59,7 +59,8 @@ def test_div(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a / b - self.assertTrue(np.array_equal(res.numpy(), a_np / b_np)) + #NOTE: Not sure why array_equal fails on windows, allclose is acceptable + self.assertTrue(np.allclose(res.numpy(), a_np / b_np)) def test_add_scalar(self): a_np = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py 
b/python/paddle/fluid/tests/unittests/test_mul_op.py index 4f2466c9b7042..8ca06aa952184 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -175,35 +175,5 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y')) -class TestMulOpAttr(unittest.TestCase): - def test_out(self): - with fluid.program_guard(fluid.Program()): - x = fluid.data(name="x", shape=[2, 3], dtype="float32") - y = fluid.data(name='y', shape=[3, 2], dtype='float32') - - res = fluid.data(name="output", shape=[2, 2], dtype="float32") - y_1 = paddle.mul(x, y, out=res) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - data1 = np.array([[1, 2, 3], [4, 5, 6]], dtype='float32') - data2 = np.array([[1, 2], [1, 2], [1, 2]], dtype='float32') - np_res, np_y_1 = exe.run(feed={'x': data1, - 'y': data2}, - fetch_list=[res, y_1]) - - self.assertEqual((np_res == np_y_1).all(), True) - - def test_name(self): - with fluid.program_guard(fluid.Program()): - x = fluid.data(name="x", shape=[2, 3], dtype="float32") - y = fluid.data(name='y', shape=[3, 2], dtype='float32') - - res = fluid.data(name="output", shape=[2, 2], dtype="float32") - y_1 = paddle.mul(x, y, name='mul_res') - y_2 = paddle.mul(x, y, out=res, name='mul_res') - self.assertEqual(('mul_res' in y_1.name), True) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py new file mode 100644 index 0000000000000..64421f6a1c6a0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_multiply.py @@ -0,0 +1,140 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import paddle.tensor as tensor +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import numpy as np +import unittest + + +class TestMultiplyAPI(unittest.TestCase): + """TestMultiplyAPI.""" + + def __run_static_graph_case(self, x_data, y_data, axis=-1): + with program_guard(Program(), Program()): + x = paddle.nn.data(name='x', shape=x_data.shape, dtype=x_data.dtype) + y = paddle.nn.data(name='y', shape=y_data.shape, dtype=y_data.dtype) + res = tensor.multiply(x, y, axis=axis) + + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + outs = exe.run(fluid.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[res]) + res = outs[0] + return res + + def __run_dynamic_graph_case(self, x_data, y_data, axis=-1): + paddle.enable_imperative() + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + res = paddle.multiply(x, y, axis=axis) + return res.numpy() + + def test_multiply(self): + """test_multiply.""" + np.random.seed(7) + # test static computation graph: 1-d array + x_data = np.random.rand(200) + y_data = np.random.rand(200) + res = self.__run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + + # test static computation graph: 2-d array + x_data = np.random.rand(2, 500) + y_data = np.random.rand(2, 500) + res = self.__run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, 
np.multiply(x_data, y_data))) + + # test static computation graph: broadcast + x_data = np.random.rand(2, 500) + y_data = np.random.rand(500) + res = self.__run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + + # test static computation graph: broadcast with axis + x_data = np.random.rand(2, 300, 40) + y_data = np.random.rand(300) + res = self.__run_static_graph_case(x_data, y_data, axis=1) + expected = np.multiply(x_data, y_data[..., np.newaxis]) + self.assertTrue(np.allclose(res, expected)) + + # test dynamic computation graph: 1-d array + x_data = np.random.rand(200) + y_data = np.random.rand(200) + res = self.__run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + + # test dynamic computation graph: 2-d array + x_data = np.random.rand(20, 50) + y_data = np.random.rand(20, 50) + res = self.__run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + + # test dynamic computation graph: broadcast + x_data = np.random.rand(2, 500) + y_data = np.random.rand(500) + res = self.__run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + + # test dynamic computation graph: broadcast with axis + x_data = np.random.rand(2, 300, 40) + y_data = np.random.rand(300) + res = self.__run_dynamic_graph_case(x_data, y_data, axis=1) + expected = np.multiply(x_data, y_data[..., np.newaxis]) + self.assertTrue(np.allclose(res, expected)) + + +class TestMultiplyError(unittest.TestCase): + """TestMultiplyError.""" + + def test_errors(self): + """test_errors.""" + # test static computation graph: dtype can not be int8 + paddle.disable_imperative() + with program_guard(Program(), Program()): + x = paddle.nn.data(name='x', shape=[100], dtype=np.int8) + y = paddle.nn.data(name='y', shape=[100], dtype=np.int8) + self.assertRaises(TypeError, tensor.multiply, x, y) + + # test static computation 
graph: inputs must be broadcastable + with program_guard(Program(), Program()): + x = paddle.nn.data(name='x', shape=[20, 50], dtype=np.float64) + y = paddle.nn.data(name='y', shape=[20], dtype=np.float64) + self.assertRaises(fluid.core.EnforceNotMet, tensor.multiply, x, y) + + np.random.seed(7) + # test dynamic computation graph: dtype can not be int8 + paddle.enable_imperative() + x_data = np.random.randn(200).astype(np.int8) + y_data = np.random.randn(200).astype(np.int8) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + + # test dynamic computation graph: inputs must be broadcastable + x_data = np.random.rand(200, 5) + y_data = np.random.rand(200) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ones_op.py b/python/paddle/fluid/tests/unittests/test_ones_op.py index 6061bfcff442e..94a23b32aa8ea 100644 --- a/python/paddle/fluid/tests/unittests/test_ones_op.py +++ b/python/paddle/fluid/tests/unittests/test_ones_op.py @@ -26,27 +26,36 @@ class ApiOnesTest(unittest.TestCase): - def test_out(self): - with fluid.program_guard(fluid.Program()): + def test_paddle_ones(self): + with paddle.program_guard(paddle.Program()): ones = paddle.ones(shape=[10], dtype="float64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[ones]) expected_result = np.ones(10, dtype="float64") self.assertEqual((result == expected_result).all(), True) - with fluid.program_guard(fluid.Program()): + with paddle.program_guard(paddle.Program()): + ones = paddle.ones(shape=[10], dtype="float64") + place = paddle.CPUPlace() + exe = paddle.Executor(place) + result, = 
exe.run(fetch_list=[ones]) + expected_result = np.ones(10, dtype="float64") + self.assertEqual((result == expected_result).all(), True) + + with paddle.program_guard(paddle.Program()): ones = paddle.ones(shape=[10], dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[ones]) expected_result = np.ones(10, dtype="int64") self.assertEqual((result == expected_result).all(), True) - with fluid.program_guard(fluid.Program()): - ones = paddle.ones(shape=[10], dtype="int64", device="cpu") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + def test_fluid_ones(self): + with paddle.program_guard(paddle.Program()): + ones = fluid.layers.ones(shape=[10], dtype="int64") + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[ones]) expected_result = np.ones(10, dtype="int64") self.assertEqual((result == expected_result).all(), True) @@ -55,25 +64,25 @@ def test_out(self): class ApiOnesZerosError(unittest.TestCase): def test_errors(self): def test_error1(): - with fluid.program_guard(fluid.Program()): - ones = paddle.ones(shape=10, dtype="int64", device="opu") + with paddle.program_guard(paddle.Program()): + ones = paddle.ones(shape=10, dtype="int64") - self.assertRaises(ValueError, test_error1) + self.assertRaises(TypeError, test_error1) def test_error2(): - with fluid.program_guard(fluid.Program()): - ones = paddle.ones(shape=10, dtype="int64", device="opu") + with paddle.program_guard(paddle.Program()): + ones = paddle.ones(shape=10) - self.assertRaises(ValueError, test_error2) + self.assertRaises(TypeError, test_error2) def test_error3(): - with fluid.program_guard(fluid.Program()): + with paddle.program_guard(paddle.Program()): ones = fluid.layers.ones(shape=10, dtype="int64") self.assertRaises(TypeError, test_error3) def test_error4(): - with fluid.program_guard(fluid.Program()): + with paddle.program_guard(paddle.Program()): 
ones = fluid.layers.ones(shape=[10], dtype="int8") self.assertRaises(TypeError, test_error4) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 7894fc018876c..3384c80499b63 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -948,5 +948,82 @@ def mlp(input_x, input_y): self.assertEqual(drop_vec[0].tolist(), drop_vec[1].tolist()) +class TestGradientMergeOptimizer(unittest.TestCase): + def net(self): + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + b1 = block.create_parameter( + dtype="float32", shape=[5, 8], lod_level=0, name="b1") + b1_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="b1_out") + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + block.append_op( + type="elementwise_add", + inputs={"X": mul_out, + "Y": b1}, + outputs={"Out": b1_out}) + block.append_op( + type="mean", inputs={"X": b1_out}, outputs={"Out": mean_out}) + return mean_out + + def test_program_desc(self, ): + cost = self.net() + main_program = cost.block.program + init_program = framework.Program() + self.assertEqual(main_program.num_blocks, 1) + self.assertEqual(len(cost.block.ops), 3) + self.assertEqual([op.type for op in cost.block.ops], + ["mul", "elementwise_add", "mean"]) + + opt = optimizer.SGD(learning_rate=1.0) + opt = optimizer.GradientMergeOptimizer(opt, k_steps=4) + with framework.program_guard(main_program, init_program): + ops, params_grads = 
opt.minimize(cost) + + self.assertEqual(main_program.num_blocks, 4) + + # main block + self.assertEqual(len(cost.block.ops), 17) + self.assertEqual([op.type for op in cost.block.ops], [ + 'mul', 'elementwise_add', 'mean', 'fill_constant', 'mean_grad', + 'elementwise_add_grad', 'mul_grad', 'increment', 'fill_constant', + 'fill_constant', 'elementwise_mod', 'cast', 'not_equal', + 'logical_not', 'conditional_block', 'conditional_block', + 'conditional_block_grad' + ]) + + # merge block + self.assertEqual(len(main_program.block(1).ops), 2) + self.assertEqual([op.type for op in main_program.block(1).ops], [ + 'elementwise_add', + 'elementwise_add', + ]) + + # reset block + self.assertEqual(len(main_program.block(2).ops), 6) + self.assertEqual([op.type for op in main_program.block(2).ops], [ + 'elementwise_add', 'scale', 'elementwise_add', 'scale', + 'fill_constant', 'fill_constant' + ]) + + # optimize block + self.assertEqual(len(main_program.block(3).ops), 2) + self.assertEqual([op.type for op in main_program.block(3).ops], + ['sgd', 'sgd']) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py index 62eb7e1155e6f..57ff4890f6a13 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py @@ -44,7 +44,7 @@ def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5): for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) if not use_cuda: return @@ -72,17 +72,17 @@ def _compare_reduce_and_allreduce(self, use_cuda, 
delta2=1e-5): for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) for loss in zip(reduce_first_loss, reduce_first_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(reduce_last_loss, reduce_last_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) class TestResnetWithReduceCPU(TestResnetWithReduceBase): diff --git a/python/paddle/fluid/tests/unittests/test_program_code.py b/python/paddle/fluid/tests/unittests/test_program_code.py index 036007c6accd4..76ff3f37bf006 100644 --- a/python/paddle/fluid/tests/unittests/test_program_code.py +++ b/python/paddle/fluid/tests/unittests/test_program_code.py @@ -15,6 +15,7 @@ import os import time import unittest +import sys from multiprocessing import Process import signal @@ -29,6 +30,8 @@ class TestProgram2Code(unittest.TestCase): + @unittest.skipIf(sys.platform == "win32", + "Windows does not support distribution") def test_print(self): place = fluid.CPUPlace() self.init_serv(place) diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 40c9480a2c995..5b2d5be346a9b 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -17,12 +17,9 @@ import unittest import numpy as np from op_test import OpTest - 
-import paddle.fluid.core as core -from paddle.fluid.op import Operator -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard import paddle +from paddle.fluid import core +from paddle import Program, program_guard def output_hist(out): @@ -56,25 +53,11 @@ def verify_output(self, outs): class TestRandintOpError(unittest.TestCase): def test_errors(self): - main_prog = Program() - start_prog = Program() - with program_guard(main_prog, start_prog): - - def test_shape(): - shape = np.array([2, 3]) - paddle.randint(5, shape=shape, dtype='int32') - - self.assertRaises(TypeError, test_shape) - - def test_dtype(): - paddle.randint(5, shape=[32, 32], dtype='float32') - - self.assertRaises(TypeError, test_dtype) - - def test_low_high(): - paddle.randint(low=5, high=5, shape=[32, 32], dtype='int32') - - self.assertRaises(ValueError, test_low_high) + with program_guard(Program(), Program()): + self.assertRaises(TypeError, paddle.randint, 5, shape=np.array([2])) + self.assertRaises(TypeError, paddle.randint, 5, dtype='float32') + self.assertRaises(ValueError, paddle.randint, 5, 5) + self.assertRaises(ValueError, paddle.randint, -5) class TestRandintOp_attr_tensorlist(OpTest): @@ -127,46 +110,44 @@ def verify_output(self, outs): # Test python API class TestRandintAPI(unittest.TestCase): def test_api(self): - startup_program = fluid.Program() - train_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): + with program_guard(Program(), Program()): # results are from [0, 5). 
- output1 = paddle.randint(5) + out1 = paddle.randint(5) # shape is a list and dtype is 'int32' - output2 = paddle.randint( + out2 = paddle.randint( low=-100, high=100, shape=[64, 64], dtype='int32') # shape is a tuple and dtype is 'int64' - output3 = paddle.randint( + out3 = paddle.randint( low=-100, high=100, shape=(32, 32, 3), dtype='int64') # shape is a tensorlist and dtype is 'float32' - dim_1 = fluid.layers.fill_constant([1], "int64", 32) - dim_2 = fluid.layers.fill_constant([1], "int32", 50) - output4 = paddle.randint( - low=-100, high=100, shape=[dim_1, 5], dtype='int32') + dim_1 = paddle.fill_constant([1], "int64", 32) + dim_2 = paddle.fill_constant([1], "int32", 50) + out4 = paddle.randint( + low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32') # shape is a tensor and dtype is 'float64' - var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64") - output5 = paddle.randint( + var_shape = paddle.nn.data( + name='var_shape', shape=[2], dtype="int64") + out5 = paddle.randint( low=1, high=1000, shape=var_shape, dtype='int64') - place = fluid.CPUPlace() - if fluid.core.is_compiled_with_cuda(): - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - exe.run(startup_program) + place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.Executor(place) outs = exe.run( - train_program, feed={'var_shape': np.array([100, 100]).astype('int64')}, - fetch_list=[output1, output2, output3, output4, output5]) + fetch_list=[out1, out2, out3, out4, out5]) -class TestRandintDygraphMode(unittest.TestCase): - def test_check_output(self): - with fluid.dygraph.guard(): - x = paddle.randint(10, shape=[10], dtype="int32") - x_np = x.numpy() - for i in range(10): - self.assertTrue((x_np[i] >= 0 and x_np[i] < 10)) +class TestRandintImperative(unittest.TestCase): + def test_api(self): + n = 10 + with paddle.imperative.guard(): + x1 = paddle.randint(n, shape=[10], dtype="int32") + x2 = paddle.tensor.randint(n) + x3 = 
paddle.tensor.random.randint(n) + for i in [x1, x2, x3]: + for j in i.numpy().tolist(): + self.assertTrue((j >= 0 and j < n)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py new file mode 100644 index 0000000000000..bc50cf197f63e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle +import paddle.fluid as fluid +import unittest + +paddle.enable_imperative() +SEED = 2020 +np.random.seed(SEED) +fluid.default_main_program().random_seed = SEED + + +class Generator(fluid.dygraph.Layer): + def __init__(self): + super(Generator, self).__init__() + self.conv1 = paddle.nn.Conv2D(3, 3, 3, 1) + + def forward(self, x): + x = self.conv1(x) + x = fluid.layers.tanh(x) + return x + + +class Discriminator(fluid.dygraph.Layer): + def __init__(self): + super(Discriminator, self).__init__() + self.convd = paddle.nn.Conv2D(6, 3, 1) + + def forward(self, x): + x = self.convd(x) + return x + + +class TestRetainGraph(unittest.TestCase): + def cal_gradient_penalty(self, + netD, + real_data, + fake_data, + edge_data=None, + type='mixed', + constant=1.0, + lambda_gp=10.0): + if lambda_gp > 0.0: + if type == 'real': + interpolatesv = real_data + elif type == 'fake': + interpolatesv = fake_data + elif type == 'mixed': + alpha = paddle.rand((real_data.shape[0], 1)) + alpha = paddle.expand( + alpha, [1, np.prod(real_data.shape) // real_data.shape[0]]) + alpha = paddle.reshape(alpha, real_data.shape) + interpolatesv = alpha * real_data + ((1 - alpha) * fake_data) + else: + raise NotImplementedError('{} not implemented'.format(type)) + interpolatesv.stop_gradient = False + real_data.stop_gradient = True + fake_AB = paddle.concat((real_data.detach(), interpolatesv), 1) + disc_interpolates = netD(fake_AB) + + outs = paddle.fill_constant(disc_interpolates.shape, + disc_interpolates.dtype, 1.0) + gradients = paddle.imperative.grad( + outputs=disc_interpolates, + inputs=fake_AB, + grad_outputs=outs, + create_graph=True, + retain_graph=True, + only_inputs=True) + + gradients = paddle.reshape(gradients[0], [real_data.shape[0], -1]) + + gradient_penalty = paddle.reduce_mean((paddle.norm( + gradients + 1e-16, 2, 1) - constant)** + 2) * lambda_gp # added eps + return gradient_penalty, gradients + else: + return 0.0, None + + def test_retain(self): 
+ g = Generator() + d = Discriminator() + + optim_g = paddle.optimizer.Adam(parameter_list=g.parameters()) + optim_d = paddle.optimizer.Adam(parameter_list=d.parameters()) + + gan_criterion = paddle.nn.MSELoss() + l1_criterion = paddle.nn.L1Loss() + + A = np.random.rand(2, 3, 32, 32).astype('float32') + B = np.random.rand(2, 3, 32, 32).astype('float32') + + realA = paddle.imperative.to_variable(A) + realB = paddle.imperative.to_variable(B) + fakeB = g(realA) + + optim_d.clear_gradients() + fake_AB = paddle.concat((realA, fakeB), 1) + G_pred_fake = d(fake_AB.detach()) + + false_target = paddle.fill_constant(G_pred_fake.shape, 'float32', 0.0) + + G_gradient_penalty, _ = self.cal_gradient_penalty( + d, realA, fakeB, lambda_gp=10.0) + loss_d = gan_criterion(G_pred_fake, false_target) + G_gradient_penalty + + loss_d.backward(retain_graph=True) + optim_d.minimize(loss_d) + + optim_g.clear_gradients() + fake_AB = paddle.concat((realA, fakeB), 1) + G_pred_fake = d(fake_AB) + true_target = paddle.fill_constant(G_pred_fake.shape, 'float32', 1.0) + loss_g = l1_criterion(fakeB, realB) + gan_criterion(G_pred_fake, + true_target) + + loss_g.backward() + optim_g.minimize(loss_g) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index c00ad536656d9..1df50d63e3f67 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -181,8 +181,7 @@ def test_check_grad(self): pass -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") +@unittest.skip('disable TestSoftmaxFP16Op2') class TestSoftmaxFP16Op2(TestSoftmaxOp): def init_kernel_type(self): self.dtype = np.float16 diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py index 75f474052cc94..5ab13cec540aa 100644 --- 
a/python/paddle/fluid/tests/unittests/test_squeeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -70,6 +70,14 @@ def init_test_case(self): self.new_shape = (6, 5, 1, 4) +# Correct: The demension of axis is not of size 1 remains unchanged. +class TestSqueezeOp4(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, 2) + self.new_shape = (6, 5, 1, 4, 1) + + class TestSqueezeOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -90,7 +98,7 @@ def test_out(self): with fluid.program_guard(fluid.Program(), fluid.Program()): data1 = fluid.layers.data( 'data1', shape=[-1, 1, 10], dtype='float64') - result_squeeze = paddle.squeeze(data1, axes=[1]) + result_squeeze = paddle.squeeze(data1, axis=[1]) place = fluid.CPUPlace() exe = fluid.Executor(place) input1 = np.random.random([5, 1, 10]).astype('float64') @@ -105,7 +113,25 @@ def test_out(self): with fluid.dygraph.guard(): input_1 = np.random.random([5, 1, 10]).astype("int32") input = fluid.dygraph.to_variable(input_1) - output = paddle.squeeze(input, axes=[1]) + output = paddle.squeeze(input, axis=[1]) + out_np = output.numpy() + expected_out = np.squeeze(input_1, axis=1) + self.assertTrue(np.allclose(expected_out, out_np)) + + def test_axis_not_list(self): + with fluid.dygraph.guard(): + input_1 = np.random.random([5, 1, 10]).astype("int32") + input = fluid.dygraph.to_variable(input_1) + output = paddle.squeeze(input, axis=1) + out_np = output.numpy() + expected_out = np.squeeze(input_1, axis=1) + self.assertTrue(np.allclose(expected_out, out_np)) + + def test_dimension_not_1(self): + with fluid.dygraph.guard(): + input_1 = np.random.random([5, 1, 10]).astype("int32") + input = fluid.dygraph.to_variable(input_1) + output = paddle.squeeze(input, axis=(1, 2)) out_np = output.numpy() expected_out = np.squeeze(input_1, axis=1) self.assertTrue(np.allclose(expected_out, out_np)) diff --git 
a/python/paddle/fluid/tests/unittests/test_zeros_op.py b/python/paddle/fluid/tests/unittests/test_zeros_op.py index b7f7d93418342..0cf51a87cf6b8 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_op.py @@ -36,26 +36,43 @@ def test_errors(self): class ApiZerosTest(unittest.TestCase): def test_out(self): - with paddle.program_guard(fluid.Program()): + with program_guard(Program()): zeros = paddle.zeros(shape=[10], dtype="float64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[zeros]) expected_result = np.zeros(10, dtype="float64") self.assertEqual((result == expected_result).all(), True) - with paddle.program_guard(fluid.Program()): + with paddle.program_guard(Program()): zeros = paddle.zeros(shape=[10], dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) result, = exe.run(fetch_list=[zeros]) expected_result = np.zeros(10, dtype="int64") self.assertEqual((result == expected_result).all(), True) - with paddle.program_guard(fluid.Program()): + with program_guard(Program()): zeros = paddle.zeros(shape=[10], dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.Executor(place) + result, = exe.run(fetch_list=[zeros]) + expected_result = np.zeros(10, dtype="int64") + self.assertEqual((result == expected_result).all(), True) + + with program_guard(Program()): + out_np = np.zeros(shape=(1), dtype='float32') + out = paddle.zeros(shape=[1], dtype="float32") + place = paddle.CPUPlace() + exe = paddle.Executor(place) + result = exe.run(fetch_list=[out]) + self.assertEqual((result == out_np).all(), True) + + def test_fluid_out(self): + with program_guard(Program()): + zeros = fluid.layers.zeros(shape=[10], dtype="int64") + place = paddle.CPUPlace() + exe = 
paddle.Executor(place) result, = exe.run(fetch_list=[zeros]) expected_result = np.zeros(10, dtype="int64") self.assertEqual((result == expected_result).all(), True) diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index db5ad92ff5ead..4629089e39c94 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -74,7 +74,8 @@ 'transpose2', \ 'trilinear_interp', \ 'var_conv_2d', \ - 'warpctc' + 'warpctc', \ + 'bilateral_slice' ] NO_FP16_CHECK_GRAD_OP_LIST = [ diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index fd3d5f3104f82..ce6868b5c70ae 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -40,7 +40,8 @@ 'teacher_student_sigmoid_loss', \ 'unpool', \ 'yolov3_loss', \ - 'inverse' + 'inverse', \ + 'bilateral_slice' ] NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp'] diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index ffef35b7acc27..478e05c8975d0 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -49,6 +49,8 @@ def __init__(self): self._infer = False def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period): + # convert fetch_info to list + fetch_info = list(fetch_info) for i, v in enumerate(fetch_vars): self.proto_desc.fetch_config.fetch_var_names.extend([v.name]) self.proto_desc.fetch_config.fetch_var_str_format.extend( diff --git a/python/paddle/incubate/hapi/__init__.py b/python/paddle/incubate/hapi/__init__.py index 30a2b4ffcbd8a..235460ae76a62 100644 --- a/python/paddle/incubate/hapi/__init__.py +++ 
b/python/paddle/incubate/hapi/__init__.py @@ -16,7 +16,10 @@ from . import progressbar from . import callbacks from . import download + from . import model +from .model import * + from . import metrics from . import loss from . import datasets @@ -24,6 +27,11 @@ from . import vision from . import text +from . import device +from .device import * + +from .dygraph_layer_patch import monkey_patch_layer + logger.setup_logger() __all__ = [ @@ -35,6 +43,6 @@ 'loss', 'vision', 'text', -] +] + model.__all__ + device.__all__ -__all__ += model.__all__ +monkey_patch_layer() diff --git a/python/paddle/incubate/hapi/callbacks.py b/python/paddle/incubate/hapi/callbacks.py index 7b3c41584151c..747108e257238 100644 --- a/python/paddle/incubate/hapi/callbacks.py +++ b/python/paddle/incubate/hapi/callbacks.py @@ -291,30 +291,22 @@ class ProgBarLogger(Callback): Examples: .. code-block:: python - import numpy as np - from paddle import fluid - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.loss import CrossEntropy - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.vision.models import LeNet - from paddle.incubate.hapi.callbacks import ProgBarLogger - from paddle.incubate.hapi.model import Input, set_device + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')] + labels = [hapi.Input('label', [None, 1], 'int64')] - train_dataset = MNIST(mode='train') + train_dataset = hapi.datasets.MNIST(mode='train') - model = LeNet() + model = hapi.Model(hapi.vision.LeNet(), inputs, labels) optim = fluid.optimizer.Adam(0.001) - model.prepare(optimizer=optim, - loss_function=CrossEntropy(), - metrics=Accuracy(), - inputs=inputs, - labels=labels) + model.prepare(optimizer=optim, + loss_function=hapi.loss.CrossEntropy(), + metrics=hapi.metrics.Accuracy()) - 
callback = ProgBarLogger(log_freq=10) + callback = hapi.callbacks.ProgBarLogger(log_freq=10) model.fit(train_dataset, batch_size=64, callbacks=callback) """ @@ -433,31 +425,22 @@ class ModelCheckpoint(Callback): Examples: .. code-block:: python - import numpy as np - from paddle import fluid - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.loss import CrossEntropy - from paddle.incubate.hapi.datasets import MNIST - - from paddle.incubate.hapi.vision.models import LeNet - from paddle.incubate.hapi.callbacks import ModelCheckpoint - from paddle.incubate.hapi.model import Input, set_device + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + inputs = [hapi.Input('image', [-1, 1, 28, 28], 'float32')] + labels = [hapi.Input('label', [None, 1], 'int64')] - train_dataset = MNIST(mode='train') + train_dataset = hapi.datasets.MNIST(mode='train') - model = LeNet() + model = hapi.Model(hapi.vision.LeNet(), inputs, labels) optim = fluid.optimizer.Adam(0.001) - model.prepare(optimizer=optim, - loss_function=CrossEntropy(), - metrics=Accuracy(), - inputs=inputs, - labels=labels) + model.prepare(optimizer=optim, + loss_function=hapi.loss.CrossEntropy(), + metrics=hapi.metrics.Accuracy()) - callback = ModelCheckpoint(save_dir='./temp') + callback = hapi.callbacks.ModelCheckpoint(save_dir='./temp') model.fit(train_dataset, batch_size=64, callbacks=callback) """ diff --git a/python/paddle/incubate/hapi/device.py b/python/paddle/incubate/hapi/device.py new file mode 100644 index 0000000000000..3ff29822f6f45 --- /dev/null +++ b/python/paddle/incubate/hapi/device.py @@ -0,0 +1,66 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six + +import paddle.fluid as fluid +from paddle.fluid.dygraph.parallel import ParallelEnv + +__all__ = ['set_device', ] + +# TODO(qingqing01): remove or refine _global_device, set_device and get_device +# after core framework supporting these function. +_global_device = None + + +def set_device(device): + """ + Args: + device (str): specify device type, 'cpu' or 'gpu'. + + Returns: + fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place. + + Examples: + .. code-block:: python + + import paddle.incubate.hapi as hapi + + input = hapi.set_device('gpu') + """ + + assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \ + "Expected device in ['cpu', 'gpu'], but got {}".format(device) + + device = fluid.CUDAPlace(ParallelEnv().dev_id) \ + if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \ + else fluid.CPUPlace() + + global _global_device + _global_device = device + return device + + +def _get_device(): + """ + Return global device. + """ + if _global_device is not None: + device = _global_device + else: + if fluid.is_compiled_with_cuda(): + device = fluid.CUDAPlace(ParallelEnv().dev_id) + else: + device = fluid.CPUPlace() + return device diff --git a/python/paddle/incubate/hapi/dygraph_layer_patch.py b/python/paddle/incubate/hapi/dygraph_layer_patch.py new file mode 100644 index 0000000000000..80a3d82fc87cf --- /dev/null +++ b/python/paddle/incubate/hapi/dygraph_layer_patch.py @@ -0,0 +1,104 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import paddle.fluid as fluid +from paddle.fluid.framework import in_dygraph_mode + +from .device import _get_device + + +def monkey_patch_layer(): + def load_dict(self, + stat_dict, + include_sublayers=True, + use_structured_name=True): + ''' + Set parameters from stat_dict. All the parameters will be reset by the + tensor in the stat_dict + + This api will be Deprecated. Please use set_dict + + Parameters: + state_dict(dict) : Dict contains all the parameters + include_sublayers(bool, optional) : If true, also include the + parameters from sublayers. Default: True + use_structured_name(bool, optional) : If true, use structured name + as key, otherwise, use parameter name as key. Default: True + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + + state_dict = emb.state_dict() + fluid.save_dygraph( state_dict, "paddle_dy") + + para_state_dict, _ = fluid.load_dygraph( "paddle_dy") + emb.load_dict( para_state_dict ) + + ''' + + def _check_match(key, param): + state = stat_dict.get(key, None) + if state is None: + raise ValueError( + "{} is not found in the providing file.".format(key)) + if list(state.shape) != list(param.shape): + raise ValueError( + "{} receives a shape {}, but the expected shape is {}.". 
+ format(key, list(state.shape), list(param.shape))) + return param, state + + matched_param_state = [] + for key, param in self.state_dict().items(): + key_name = key if use_structured_name else param.name + try: + match_res = _check_match(key_name, param) + except ValueError as err: + warnings.warn(("Skip loading for {}. ".format(key) + str(err))) + matched_param_state.append(match_res) + + if in_dygraph_mode(): + for param, state in matched_param_state: + param.set_value(state) + else: + + def _set_var(var, ndarray): + t = fluid.global_scope().find_var(var.name).get_tensor() + p = t._place() + if p.is_cpu_place(): + place = fluid.CPUPlace() + elif p.is_cuda_pinned_place(): + place = fluid.CUDAPinnedPlace() + else: + p = fluid.core.Place() + p.set_place(t._place()) + place = fluid.CUDAPlace(p.gpu_device_id()) + t.set(ndarray, place) + + executor = fluid.Executor(_get_device())._default_executor + # restore parameter states + fluid.core._create_loaded_parameter( + [param for param, state in matched_param_state], + fluid.global_scope(), executor) + for param, state in matched_param_state: + _set_var(param, state) + + setattr(fluid.dygraph.Layer, 'load_dict', load_dict) diff --git a/python/paddle/incubate/hapi/loss.py b/python/paddle/incubate/hapi/loss.py index 8f2e28477953d..72c8488d596e7 100644 --- a/python/paddle/incubate/hapi/loss.py +++ b/python/paddle/incubate/hapi/loss.py @@ -86,16 +86,13 @@ class CrossEntropy(Loss): Examples: .. 
code-block:: python - from paddle.incubate.hapi.model import Input - from paddle.incubate.hapi.vision.models import LeNet - from paddle.incubate.hapi.loss import CrossEntropy + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + fluid.enable_dygraph() - model = LeNet() - loss = CrossEntropy() - model.prepare(loss_function=loss, inputs=inputs, labels=labels) + model = hapi.Model(hapi.vision.LeNet()) + model.prepare(loss_function=hapi.loss.CrossEntropy()) """ @@ -123,16 +120,14 @@ class SoftmaxWithCrossEntropy(Loss): Examples: .. code-block:: python - from paddle.incubate.hapi.model import Input - from paddle.incubate.hapi.vision.models import LeNet - from paddle.incubate.hapi.loss import SoftmaxWithCrossEntropy + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + fluid.enable_dygraph() - model = LeNet(classifier_activation=None) - loss = SoftmaxWithCrossEntropy() - model.prepare(loss_function=loss, inputs=inputs, labels=labels) + model = hapi.Model(hapi.vision.LeNet(classifier_activation=None)) + loss = hapi.loss.SoftmaxWithCrossEntropy() + model.prepare(loss_function=loss) """ def __init__(self, average=True): diff --git a/python/paddle/incubate/hapi/metrics.py b/python/paddle/incubate/hapi/metrics.py index f26b47b257b4b..3b630f50b246d 100644 --- a/python/paddle/incubate/hapi/metrics.py +++ b/python/paddle/incubate/hapi/metrics.py @@ -170,30 +170,20 @@ class Accuracy(Metric): .. 
code-block:: python - from paddle import fluid - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.loss import CrossEntropy - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.model import Input - from paddle.incubate.hapi.vision.models import LeNet + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi fluid.enable_dygraph() - train_dataset = MNIST(mode='train') + train_dataset = hapi.datasets.MNIST(mode='train') - model = LeNet() + model = hapi.Model(hapi.vision.LeNet()) optim = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=model.parameters()) - - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - model.prepare( optim, - loss_function=CrossEntropy(average=False), - metrics=Accuracy(), - inputs=inputs, - labels=labels) + loss_function=hapi.loss.CrossEntropy(average=False), + metrics=hapi.metrics.Accuracy()) model.fit(train_dataset, batch_size=64) diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/incubate/hapi/model.py index f8b928397c8dc..c94cddd826f5f 100644 --- a/python/paddle/incubate/hapi/model.py +++ b/python/paddle/incubate/hapi/model.py @@ -39,36 +39,40 @@ from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized from .metrics import Metric from .callbacks import config_callbacks -from .utils import to_list, to_numpy, flatten_list, restore_flatten_list +from .utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args +from .device import _get_device __all__ = [ 'Model', 'Input', - 'set_device', ] -def set_device(device): - """ - Args: - device (str): specify device type, 'cpu' or 'gpu'. - - Returns: - fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place. +class Input(fluid.dygraph.Layer): """ + Define inputs the model. 
- assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \ - "Expected device in ['cpu', 'gpu'], but got {}".format(device) - - place = fluid.CUDAPlace(ParallelEnv().dev_id) \ - if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \ - else fluid.CPUPlace() + Args: + name (str): The name/alias of the variable, see :ref:`api_guide_Name` + for more details. + shape (tuple(integers)|list[integers]): List|Tuple of integers + declaring the shape. You can set "None" or -1 at a dimension + to indicate the dimension can be of any size. For example, + it is useful to set changeable batch size as "None" or -1. + dtype (np.dtype|VarType|str, optional): The type of the data. Supported + dtype: bool, float16, float32, float64, int8, int16, int32, int64, + uint8. Default: float32. + + Examples: + .. code-block:: python - return place + import paddle.incubate.hapi as hapi + input = hapi.Input('x', [None, 784], 'float32') + label = hapi.Input('label', [None, 1], 'int64') + """ -class Input(fluid.dygraph.Layer): - def __init__(self, shape=None, dtype=None, name=None): + def __init__(self, name, shape=None, dtype='float32'): super(Input, self).__init__() self.shape = shape self.dtype = dtype @@ -132,7 +136,7 @@ def test_batch(self, inputs): return self._run(inputs, None) def parameters(self, *args, **kwargs): - return super(Model, self.model).parameters(*args, **kwargs) + return self.model.network.parameters(*args, **kwargs) def save(self, path): def _save(state, path): @@ -151,7 +155,7 @@ def _save(state, path): if dir_name and not os.path.exists(dir_name): os.makedirs(dir_name) param_path = path + ".pdparams" - _save(self.model.state_dict(), param_path) + _save(self.model.network.state_dict(), param_path) prog = self._progs.get('train', None) if prog is None or self.model._optimizer is None: return @@ -384,7 +388,7 @@ def _make_program(self, mode): inputs = [k.forward() for k in to_list(ins)] labels = [k.forward() for k in to_list(lbls)] 
self._label_vars[mode] = labels - outputs = to_list(self.model.forward(*inputs)) + outputs = to_list(self.model.network.forward(*inputs)) if mode != 'test' and self.model._loss_function: losses = self.model._loss_function(outputs, labels) @@ -479,8 +483,8 @@ def __init__(self, model): stradegy.local_rank = ParallelEnv().local_rank stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints stradegy.current_endpoint = ParallelEnv().current_endpoint - self.ddp_model = fluid.dygraph.parallel.DataParallel(self.model, - stradegy) + self.ddp_model = fluid.dygraph.parallel.DataParallel( + self.model.network, stradegy) @property def mode(self): @@ -494,7 +498,7 @@ def mode(self, value): def train_batch(self, inputs, labels=None): assert self.model._optimizer, \ "model not ready, please call `model.prepare()` first" - super(Model, self.model).train() + self.model.network.train() self.mode = 'train' inputs = to_list(inputs) if labels is not None: @@ -507,13 +511,14 @@ def train_batch(self, inputs, labels=None): final_loss.backward() self.ddp_model.apply_collective_grads() else: - outputs = self.model.forward(* [to_variable(x) for x in inputs]) + outputs = self.model.network.forward( + * [to_variable(x) for x in inputs]) losses = self.model._loss_function(outputs, labels) final_loss = fluid.layers.sum(losses) final_loss.backward() self.model._optimizer.minimize(final_loss) - self.model.clear_gradients() + self.model.network.clear_gradients() metrics = [] for metric in self.model._metrics: metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list( @@ -525,12 +530,12 @@ def train_batch(self, inputs, labels=None): if len(metrics) > 0 else [to_numpy(l) for l in losses] def eval_batch(self, inputs, labels=None): - super(Model, self.model).eval() + self.model.network.eval() self.mode = 'eval' inputs = to_list(inputs) if labels is not None: labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.forward(* [to_variable(x) for x in inputs]) + outputs = 
self.model.network.forward(* [to_variable(x) for x in inputs]) if self.model._loss_function: losses = self.model._loss_function(outputs, labels) else: @@ -571,20 +576,20 @@ def eval_batch(self, inputs, labels=None): if len(metrics) > 0 else [to_numpy(l) for l in losses] def test_batch(self, inputs): - super(Model, self.model).eval() + self.model.network.eval() self.mode = 'test' inputs = [to_variable(x) for x in to_list(inputs)] - outputs = self.model.forward(*inputs) + outputs = self.model.network.forward(*inputs) if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace): outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)] return [to_numpy(o) for o in to_list(outputs)] def parameters(self, *args, **kwargs): - return super(Model, self.model).parameters(*args, **kwargs) + return self.model.network.parameters(*args, **kwargs) def save(self, path): - params = self.model.state_dict() + params = self.model.network.state_dict() fluid.save_dygraph(params, path) if self.model._optimizer is None: return @@ -614,7 +619,7 @@ def load(self, param_state_pairs, optim_state): opt_cls_name = self.model._optimizer.__class__.__name__ opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx - param_names = [param.name for param in self.model.parameters()] + param_names = [param.name for param in self.model.network.parameters()] for var_name, state_var in sorted( optim_state.items(), key=lambda x: len(x[0]), reverse=True): if var_name in ["@LR_DECAY_COUNTER@", "global_step"]: @@ -649,7 +654,7 @@ def load(self, param_state_pairs, optim_state): self.model._optimizer.set_dict(converted_state) -class Model(fluid.dygraph.Layer): +class Model(object): """ An Model object is network with training and inference features. Dynamic graph and static graph are supported at the same time, @@ -658,56 +663,79 @@ class Model(fluid.dygraph.Layer): instantiating a Model. The input description, i.e, hapi.Input, must be required for static graph. 
+ Args: + network (fluid.dygraph.Layer): The network is an instance of + fluid.dygraph.Layer. + inputs (Input|list|dict|None): `inputs`, entry points of network, + could be a Input layer, or lits of Input layers, + or dict (name: Input), or None. For static graph, + inputs must be set. For dynamic graph, it could be None. + labels (Input|list|None): `labels`, entry points of network, + could be a Input layer or lits of Input layers, or None. + For static graph, if labels is required in loss_function, + labels must be set. Otherwise, it could be None. + + Usage: .. code-block:: python - import numpy as np - import paddle import paddle.fluid as fluid - #import paddle.incubate.hapi as hapi - from paddle.incubate.hapi import Model, Input, set_device - from paddle.incubate.hapi.loss import CrossEntropy - from paddle.incubate.hapi.dataset import MNIST - - class MyModel(Model): + import paddle.incubate.hapi as hapi + + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 10, act='softmax') + super(MyNet, self).__init__() + self._fc1 = fluid.dygraph.Linear(784, 200, act='softmax') def forward(self, x): - y = self._fc(x) + y = self._fc1(x) return y - device = set_device('gpu') + + device = hapi.set_device('gpu') # if use static graph, do not set fluid.enable_dygraph(device) - model = MyModel() - optim = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) - inputs = [Input([None, 784], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] + # inputs and labels are not required for dynamic graph. 
+ input = hapi.Input('x', [None, 784], 'float32') + label = hapi.Input('label', [None, 1], 'int64') - mnist_data = MNIST(mode='train') + model = hapi.Model(MyNet(), input, label) + optim = fluid.optimizer.SGD(learning_rate=1e-3, + parameter_list=model.parameters()) model.prepare(optim, - CrossEntropy(average=True), - hapi.metrics.Accuracy(), - inputs, - labels, - device=device) + hapi.loss.CrossEntropy(average=True), + hapi.metrics.Accuracy()) + + mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) model.fit(mnist_data, epochs=2, batch_size=32, verbose=1) + """ - def __init__(self): - super(Model, self).__init__(self.__class__.__name__) + def __init__(self, network, inputs=None, labels=None): self.mode = 'train' + self.network = network self._inputs = None self._labels = None self._loss_function = None self._loss_weights = None self._optimizer = None - self._device = None self._optimizer = None self._test_dataloader = None + if not in_dygraph_mode(): + if not isinstance(inputs, (list, dict, Input)): + raise TypeError( + "'inputs' must be list or dict in static graph mode") + if inputs is None: + self._inputs = [Input(name=n) \ + for n in extract_args(self.network.forward) if n != 'self'] + elif isinstance(input, dict): + self._inputs = [inputs[n] \ + for n in extract_args(self.network.forward) if n != 'self'] + else: + self._inputs = to_list(inputs) + + self._labels = to_list(labels) + # init backend if fluid.in_dygraph_mode(): self._adapter = DynamicGraphAdapter(self) @@ -735,12 +763,12 @@ def train_batch(self, inputs, labels=None): import numpy as np import paddle.fluid as fluid - from paddle.incubate.hapi import Model, Input, set_device + import paddle.incubate.hapi as hapi - class MyModel(Model): + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() - self._fc = Linear(784, 1, act='softmax') + super(MyNet, self).__init__() + self._fc = fluid.dygraph.Linear(784, 10, act='softmax') def forward(self, x): y = 
self._fc(x) return y @@ -748,17 +776,12 @@ def forward(self, x): device = hapi.set_device('gpu') fluid.enable_dygraph(device) - model = MyModel() + input = hapi.Input('x', [None, 784], 'float32') + label = hapi.Input('label', [None, 1], 'int64') + model = hapi.Model(MyNet(), input, label) optim = fluid.optimizer.SGD(learning_rate=1e-3, parameter_list=model.parameters()) - - inputs = [Input([None, 784], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] - model.prepare(optim, - CrossEntropy(average=True), - inputs=inputs, - labels=labels, - device=device) + model.prepare(optim, hapi.loss.CrossEntropy(average=True)) data = np.random.random(size=(4,784)).astype(np.float32) label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) loss = model.train_batch([data], [label]) @@ -787,30 +810,26 @@ def eval_batch(self, inputs, labels=None): import numpy as np import paddle.fluid as fluid - from paddle.incubate.hapi import Model, Input, set_device + import paddle.incubate.hapi as hapi - class MyModel(Model): + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() - self._fc = fluid.dygraph.Linear(784, 1, act='softmax') + super(MyNet, self).__init__() + self._fc = fluid.dygraph.Linear(784, 10, act='softmax') def forward(self, x): y = self._fc(x) return y - device = set_device('gpu') + device = hapi.set_device('gpu') fluid.enable_dygraph(device) - model = MyModel() + input = hapi.Input('x', [None, 784], 'float32') + label = hapi.Input('label', [None, 1], 'int64') + model = hapi.Model(MyNet(), input, label) optim = fluid.optimizer.SGD(learning_rate=1e-3, parameter_list=model.parameters()) - - inputs = [Input([None, 784], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] model.prepare(optim, - CrossEntropy(average=True), - inputs=inputs, - labels=labels, - device=device) + hapi.loss.CrossEntropy(average=True)) data = np.random.random(size=(4,784)).astype(np.float32) label = 
np.random.randint(0, 10, size=(4, 1)).astype(np.int64) loss = model.eval_batch([data], [label]) @@ -836,23 +855,21 @@ def test_batch(self, inputs): import numpy as np import paddle.fluid as fluid - from paddle.incubate.hapi import Model, Input, set_device + import paddle.incubate.hapi as hapi - class MyModel(Model): + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() + super(MyNet, self).__init__() self._fc = fluid.dygraph.Linear(784, 1, act='softmax') def forward(self, x): y = self._fc(x) return y - device = set_device('gpu') + device = hapi.set_device('gpu') fluid.enable_dygraph(device) - model = MyModel() - inputs = [Input([None, 784], 'float32', name='x')] - model.prepare(inputs=inputs, - device=device) + model = hapi.Model(MyNet()) + model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) out = model.eval_batch([data]) print(out) @@ -886,19 +903,19 @@ def save(self, path): .. code-block:: python import paddle.fluid as fluid - from paddle.incubate.hapi import Model, set_device + import paddle.incubate.hapi as hapi - class MyModel(Model): + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() + super(MyNet, self).__init__() self._fc = fluid.dygraph.Linear(784, 1, act='softmax') def forward(self, x): y = self._fc(x) return y - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) - model = MyModel() + model = hapi.Model(MyNet()) model.save('checkpoint/test') """ if ParallelEnv().local_rank == 0: @@ -938,19 +955,19 @@ def load(self, path, skip_mismatch=False, reset_optimizer=False): .. 
code-block:: python import paddle.fluid as fluid - from paddle.incubate.hapi import Model, set_device + import paddle.incubate.hapi as hapi - class MyModel(Model): + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() + super(MyNet, self).__init__() self._fc = fluid.dygraph.Linear(784, 1, act='softmax') def forward(self, x): y = self._fc(x) return y - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) - model = MyModel() + model = hapi.Model(MyNet()) model.load('checkpoint/test') """ @@ -983,7 +1000,7 @@ def _strip_postfix(path): assert param_state, "Failed to load parameters, please check path." matched_param_state = [] - for key, param in self.state_dict().items(): + for key, param in self.network.state_dict().items(): try: match_res = _check_match(key, param) except ValueError as err: @@ -1012,28 +1029,24 @@ def parameters(self, *args, **kwargs): .. code-block:: python - from paddle.incubate.hapi.model import Model, Input, set_device - class MyModel(Model): + import paddle.fluid as fluid + from paddle.incubate.hapi import Model + + class MyNet(fluid.dygraph.Layer): def __init__(self): - super(MyModel, self).__init__() + super(MyNet, self).__init__() self._fc = fluid.dygraph.Linear(20, 10, act='softmax') def forward(self, x): y = self._fc(x) return y fluid.enable_dygraph() - model = MyModel() + model = Model(MyNet()) params = model.parameters() """ return self._adapter.parameters() - def prepare(self, - optimizer=None, - loss_function=None, - metrics=None, - inputs=None, - labels=None, - device=None): + def prepare(self, optimizer=None, loss_function=None, metrics=None): """ Configures the model before runing. @@ -1046,32 +1059,13 @@ def prepare(self, no loss. metrics (Metric|list of Metric|None): If metrics is set, all metrics will be calculated and output in train/eval mode. 
- inputs (Input|list|dict|None): `inputs`, entry points of network, - could be a Input layer, or lits of Input layers, - or dict (name: Input), or None. For static graph, - inputs must be set. For dynamic graph, it could be None. - labels (Input|list|None): `labels`, entry points of network, - could be a Input layer or lits of Input layers, or None. - For static graph, if labels is required in loss_function, - labels must be set. Otherwise, it could be None. - device (str|fluid.CUDAPlace|fluid.CPUPlace|None): Specify device - type, 'CPU', 'GPU', fluid.CUDAPlace or fluid.CPUPlace. - If None, automatically select device according to - installation package version. Returns: None """ - if isinstance(device, fluid.CUDAPlace) or \ - (isinstance(device, six.string_types) and device.lower() == 'gpu') \ - or (device is None and fluid.is_compiled_with_cuda()): - if isinstance(device, fluid.CUDAPlace): - self._place = device - else: - self._place = fluid.CUDAPlace(ParallelEnv().dev_id) \ - if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0) - + self._place = _get_device() + if isinstance(self._place, fluid.CUDAPlace): global _parallel_context_initialized if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if fluid.in_dygraph_mode(): @@ -1088,27 +1082,13 @@ def prepare(self, fluid.dygraph.parallel.prepare_context() else: prepare_distributed_context(self._place) - _parallel_context_initialized = True - elif isinstance(device, fluid.CPUPlace): - self._place = device - elif (isinstance(device, six.string_types) and device.lower() == 'cpu') \ - or (device is None): - self._place = fluid.CPUPlace() - else: - raise ValueError( - "Expected device in ('gpu', 'cpu', fluid.CUDAPlace, fluid.CPUPlace, None), \ - but got {}".format(device)) self._optimizer = optimizer if loss_function: if not isinstance(loss_function, Loss): raise TypeError("'loss_function' must be sub classes of 'Loss'") self._loss_function = loss_function - if not in_dygraph_mode(): - if not 
isinstance(inputs, (list, dict, Input)): - raise TypeError( - "'inputs' must be list or dict in static graph mode") metrics = metrics or [] for metric in to_list(metrics): @@ -1117,11 +1097,6 @@ def prepare(self, metric.__class__.__name__) self._metrics = to_list(metrics) - self._inputs = to_list(inputs) if not isinstance(inputs, dict) else [ - inputs[n] for n in extract_args(self.forward) if n != 'self' - ] - self._labels = to_list(labels) - if not in_dygraph_mode(): self._adapter.prepare() @@ -1192,32 +1167,26 @@ def fit( .. code-block:: python - from paddle.incubate.hapi.model import Model, Input, set_device - from paddle.incubate.hapi.loss import CrossEntropy - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.vision.models import LeNet + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi dynamic = True - device = set_device(FLAGS.device) + device = hapi.set_device('gpu') fluid.enable_dygraph(device) if dynamic else None - train_dataset = MNIST(mode='train') - val_dataset = MNIST(mode='test') + train_dataset = hapi.datasets.MNIST(mode='train') + val_dataset = hapi.datasets.MNIST(mode='test') - inputs = [Input([None, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + input = hapi.Input('image', [None, 1, 28, 28], 'float32') + label = hapi.Input('label', [None, 1], 'int64') - model = LeNet() + model = hapi.Model(hapi.vision.LeNet(), input, label) optim = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=model.parameters()) model.prepare( optim, - CrossEntropy(), - Accuracy(topk=(1, 2)), - inputs=inputs, - labels=labels, - device=device) + hapi.loss.CrossEntropy(), + hapi.metrics.Accuracy(topk=(1, 2))) model.fit(train_dataset, val_dataset, epochs=2, @@ -1229,36 +1198,30 @@ def fit( .. 
code-block:: python - from paddle.incubate.hapi.model import Model, Input, set_device - from paddle.incubate.hapi.loss import CrossEntropy - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.vision.models import LeNet + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi dynamic = True - device = set_device(FLAGS.device) + device = hapi.set_device('gpu') fluid.enable_dygraph(device) if dynamic else None - train_dataset = MNIST(mode='train') + train_dataset = hapi.datasets.MNIST(mode='train') train_loader = fluid.io.DataLoader(train_dataset, places=device, batch_size=64) - val_dataset = MNIST(mode='test') + val_dataset = hapi.datasets.MNIST(mode='test') val_loader = fluid.io.DataLoader(val_dataset, places=device, batch_size=64) - inputs = [Input([None, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + input = hapi.Input('image', [None, 1, 28, 28], 'float32') + label = hapi.Input('label', [None, 1], 'int64') - model = LeNet() + model = hapi.Model(hapi.vision.LeNet(), input, label) optim = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=model.parameters()) model.prepare( optim, - CrossEntropy(), - Accuracy(topk=(1, 2)), - inputs=inputs, - labels=labels, - device=device) + hapi.loss.CrossEntropy(), + hapi.metrics.Accuracy(topk=(1, 2))) model.fit(train_loader, val_loader, epochs=2, @@ -1370,35 +1333,26 @@ def evaluate( Examples: .. 
code-block:: python - # declarative mode - import numpy as np - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.vision.transforms import Compose,Resize - from paddle.incubate.hapi.vision.models import LeNet - from paddle.incubate.hapi.model import Input, set_device + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi + # declarative mode + val_dataset = hapi.datasets.MNIST(mode='test') - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] - - val_dataset = MNIST(mode='test') - - model = LeNet() - model.prepare(metrics=Accuracy(), inputs=inputs, labels=labels) + input = hapi.Input('image', [-1, 1, 28, 28], 'float32') + label = hapi.Input('label', [None, 1], 'int64') + model = hapi.Model(hapi.vision.LeNet(), input, label) + model.prepare(metrics=hapi.metrics.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) # imperative mode - import paddle.fluid.dygraph as dg - place = set_device('cpu') - with dg.guard(place) as g: - model = LeNet() - model.prepare(metrics=Accuracy(), inputs=inputs, labels=labels) - - result = model.evaluate(val_dataset, batch_size=64) - print(result) + fluid.enable_dygraph() + model = hapi.Model(hapi.vision.LeNet()) + model.prepare(metrics=hapi.metrics.Accuracy()) + result = model.evaluate(val_dataset, batch_size=64) + print(result) """ @@ -1471,15 +1425,11 @@ def predict(self, Examples: .. 
code-block:: python - # declarative mode import numpy as np - from paddle.incubate.hapi.metrics import Accuracy - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.vision.transforms import Compose,Resize - from paddle.incubate.hapi.vision.models import LeNet - from paddle.incubate.hapi.model import Input, set_device + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi - class MnistDataset(MNIST): + class MnistDataset(hapi.datasets.MNIST): def __init__(self, mode, return_label=True): super(MnistDataset, self).__init__(mode=mode) self.return_label = return_label @@ -1493,25 +1443,23 @@ def __getitem__(self, idx): def __len__(self): return len(self.images) - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - test_dataset = MnistDataset(mode='test', return_label=False) - model = LeNet() - model.prepare(inputs=inputs) + # declarative mode + input = hapi.Input('image', [-1, 1, 28, 28], 'float32') + model = hapi.Model(hapi.vision.LeNet(), input) + model.prepare() result = model.predict(test_dataset, batch_size=64) - print(result) + print(len(result[0]), result[0][0].shape) # imperative mode - import paddle.fluid.dygraph as dg - place = set_device('cpu') - with dg.guard(place) as g: - model = LeNet() - model.prepare(inputs=inputs) - - result = model.predict(test_dataset, batch_size=64) - print(result) + device = hapi.set_device('cpu') + fluid.enable_dygraph(device) + model = hapi.Model(hapi.vision.LeNet()) + model.prepare() + result = model.predict(test_dataset, batch_size=64) + print(len(result[0]), result[0][0].shape) """ if test_data is not None and isinstance(test_data, Dataset): @@ -1572,6 +1520,19 @@ def save_inference_model(self, Returns: list: The fetch variables' name list + + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle.incubate.hapi as hapi + + input = hapi.Input('image', [-1, 1, 28, 28], 'float32') + model = hapi.Model(hapi.vision.LeNet(), input) + model.prepare() + + model.save_inference_model('inference_model') """ assert not fluid.in_dygraph_mode( ), 'Save inference model must in static mode!' diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py index d8b7b978621b9..cbb41d0bbb9b5 100644 --- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py @@ -22,7 +22,7 @@ from paddle import fluid -from paddle.incubate.hapi.model import Model, Input, set_device +from paddle.incubate.hapi import Model, Input, set_device from paddle.incubate.hapi.loss import CrossEntropy from paddle.incubate.hapi.vision.models import LeNet from paddle.incubate.hapi.metrics import Accuracy @@ -64,20 +64,19 @@ def test_static_multiple_gpus(self): im_shape = (-1, 1, 28, 28) batch_size = 128 - inputs = [Input(im_shape, 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + inputs = [Input('image', im_shape, 'float32')] + labels = [Input('label', [None, 1], 'int64')] + + model = Model(LeNet(), inputs, labels) + optim = fluid.optimizer.Momentum( + learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) + model.prepare(optim, CrossEntropy(), Accuracy()) train_dataset = MnistDataset(mode='train') val_dataset = MnistDataset(mode='test') test_dataset = MnistDataset(mode='test', return_label=False) - model = LeNet() - optim = fluid.optimizer.Momentum( - learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) - loss = CrossEntropy() - model.prepare(optim, loss, Accuracy(), inputs, labels, device=device) cbk = ProgBarLogger(50) - model.fit(train_dataset, val_dataset, epochs=2, diff --git 
a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py index 31ba9104b7106..e407dd12d56fe 100644 --- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py +++ b/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py @@ -22,7 +22,7 @@ from paddle import fluid -from paddle.incubate.hapi.model import Model, Input, set_device +from paddle.incubate.hapi import Model, Input, set_device from paddle.incubate.hapi.loss import CrossEntropy from paddle.incubate.hapi.vision.models import LeNet from paddle.incubate.hapi.metrics import Accuracy @@ -63,20 +63,19 @@ def test_static_multiple_gpus(self): im_shape = (-1, 1, 28, 28) batch_size = 128 - inputs = [Input(im_shape, 'float32', name='image')] - labels = [Input([None, 1], 'int64', name='label')] + inputs = [Input('image', im_shape, 'float32')] + labels = [Input('label', [None, 1], 'int64')] + + model = Model(LeNet(), inputs, labels) + optim = fluid.optimizer.Momentum( + learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) + model.prepare(optim, CrossEntropy(), Accuracy()) train_dataset = MnistDataset(mode='train') val_dataset = MnistDataset(mode='test') test_dataset = MnistDataset(mode='test', return_label=False) - model = LeNet() - optim = fluid.optimizer.Momentum( - learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) - loss = CrossEntropy() - model.prepare(optim, loss, Accuracy(), inputs, labels, device=device) cbk = ProgBarLogger(50) - model.fit(train_dataset, val_dataset, epochs=2, diff --git a/python/paddle/incubate/hapi/tests/test_callbacks.py b/python/paddle/incubate/hapi/tests/test_callbacks.py index d8630038cd87f..2a8a470736d92 100644 --- a/python/paddle/incubate/hapi/tests/test_callbacks.py +++ b/python/paddle/incubate/hapi/tests/test_callbacks.py @@ -18,7 +18,7 @@ import tempfile import shutil -from paddle.incubate.hapi.model import Input +from paddle.incubate.hapi.model import Model, Input from 
paddle.incubate.hapi.vision.models import LeNet from paddle.incubate.hapi.callbacks import config_callbacks @@ -36,9 +36,9 @@ def run_callback(self): freq = 2 eval_steps = 20 - lenet = LeNet() - inputs = [Input([None, 1, 28, 28], 'float32', name='image')] - lenet.prepare(inputs=inputs) + inputs = [Input('image', [None, 1, 28, 28], 'float32')] + lenet = Model(LeNet(), inputs) + lenet.prepare() cbks = config_callbacks( model=lenet, diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/incubate/hapi/tests/test_model.py index 9753c1838d126..6cbdf7498dbe4 100644 --- a/python/paddle/incubate/hapi/tests/test_model.py +++ b/python/paddle/incubate/hapi/tests/test_model.py @@ -26,7 +26,8 @@ from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential from paddle.fluid.dygraph.base import to_variable -from paddle.incubate.hapi.model import Model, Input, set_device +import paddle.incubate.hapi as hapi +from paddle.incubate.hapi import Model, Input from paddle.incubate.hapi.loss import CrossEntropy from paddle.incubate.hapi.metrics import Accuracy from paddle.incubate.hapi.datasets import MNIST @@ -123,7 +124,7 @@ class TestModel(unittest.TestCase): def setUpClass(cls): if not fluid.is_compiled_with_cuda(): self.skipTest('module not tested when ONLY_CPU compling') - cls.device = set_device('gpu') + cls.device = hapi.set_device('gpu') fluid.enable_dygraph(cls.device) sp_num = 1280 @@ -149,8 +150,8 @@ def setUpClass(cls): cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader) - cls.inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - cls.labels = [Input([None, 1], 'int64', name='label')] + cls.inputs = [Input('image', [-1, 1, 28, 28], 'float32')] + cls.labels = [Input('label', [None, 1], 'int64')] cls.save_dir = tempfile.mkdtemp() cls.weight_path = os.path.join(cls.save_dir, 'lenet') @@ -189,15 +190,14 @@ def fit(self, dynamic): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - model = LeNet() + 
net = LeNet() optim_new = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters()) + learning_rate=0.001, parameter_list=net.parameters()) + model = Model(net, inputs=self.inputs, labels=self.labels) model.prepare( optim_new, loss_function=CrossEntropy(average=False), - metrics=Accuracy(), - inputs=self.inputs, - labels=self.labels) + metrics=Accuracy()) model.fit(self.train_dataset, batch_size=64, shuffle=False) result = model.evaluate(self.val_dataset, batch_size=64) @@ -225,9 +225,8 @@ def fit(self, dynamic): def evaluate(self, dynamic): fluid.enable_dygraph(self.device) if dynamic else None - model = LeNet() - model.prepare( - metrics=Accuracy(), inputs=self.inputs, labels=self.labels) + model = Model(LeNet(), self.inputs, self.labels) + model.prepare(metrics=Accuracy()) model.load(self.weight_path) result = model.evaluate(self.val_dataset, batch_size=64) np.testing.assert_allclose(result['acc'], self.acc1) @@ -247,8 +246,8 @@ def evaluate(self, dynamic): def predict(self, dynamic): fluid.enable_dygraph(self.device) if dynamic else None - model = LeNet() - model.prepare(inputs=self.inputs) + model = Model(LeNet(), self.inputs) + model.prepare() model.load(self.weight_path) output = model.predict( self.test_dataset, batch_size=64, stack_outputs=True) @@ -271,7 +270,7 @@ def predict(self, dynamic): fluid.disable_dygraph() if dynamic else None -class MyModel(Model): +class MyModel(fluid.dygraph.Layer): def __init__(self): super(MyModel, self).__init__() self._fc = Linear(20, 10, act='softmax') @@ -310,28 +309,24 @@ def get_expect(): ref = get_expect() for dynamic in [True, False]: - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None self.set_seed() - model = MyModel() + net = MyModel() optim2 = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=model.parameters()) + parameter_list=net.parameters()) - inputs = [Input([None, dim], 'float32', name='x')] - labels = [Input([None, 
1], 'int64', name='label')] - model.prepare( - optim2, - loss_function=CrossEntropy(average=False), - inputs=inputs, - labels=labels, - device=device) + inputs = [Input('x', [None, dim], 'float32')] + labels = [Input('label', [None, 1], 'int64')] + model = Model(net, inputs, labels) + model.prepare(optim2, loss_function=CrossEntropy(average=False)) loss, = model.train_batch([data], [label]) np.testing.assert_allclose(loss.flatten(), ref.flatten()) fluid.disable_dygraph() if dynamic else None - def test_test_batch(self, dynamic=True): + def test_test_batch(self): dim = 20 data = np.random.random(size=(4, dim)).astype(np.float32) @@ -346,32 +341,31 @@ def get_expect(): ref = get_expect() for dynamic in [True, False]: - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None self.set_seed() - model = MyModel() - inputs = [Input([None, dim], 'float32', name='x')] - model.prepare(inputs=inputs, device=device) + net = MyModel() + inputs = [Input('x', [None, dim], 'float32')] + model = Model(net, inputs) + model.prepare() out, = model.test_batch([data]) - np.testing.assert_allclose(out, ref) + np.testing.assert_allclose(out, ref, rtol=1e-6) fluid.disable_dygraph() if dynamic else None def test_save_load(self): path = tempfile.mkdtemp() for dynamic in [True, False]: - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] + net = MyModel() + inputs = [Input('x', [None, 20], 'float32')] + labels = [Input('label', [None, 1], 'int64')] optim = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=model.parameters()) + parameter_list=net.parameters()) + model = Model(net, inputs, labels) model.prepare( - inputs=inputs, - optimizer=optim, - loss_function=CrossEntropy(average=False), - labels=labels) + optimizer=optim, 
loss_function=CrossEntropy(average=False)) model.save(path + '/test') model.load(path + '/test') shutil.rmtree(path) @@ -379,82 +373,73 @@ def test_save_load(self): def test_dynamic_save_static_load(self): path = tempfile.mkdtemp() - # for dynamic in [True, False]: - device = set_device('cpu') - fluid.enable_dygraph(device) #if dynamic else None - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] + # dynamic saving + device = hapi.set_device('cpu') + fluid.enable_dygraph(device) + model = Model(MyModel()) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare( - inputs=inputs, - optimizer=optim, - loss_function=CrossEntropy(average=False), - labels=labels) + optimizer=optim, loss_function=CrossEntropy(average=False)) model.save(path + '/test') fluid.disable_dygraph() - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] + + inputs = [Input('x', [None, 20], 'float32')] + labels = [Input('label', [None, 1], 'int64')] + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare( - inputs=inputs, - optimizer=optim, - loss_function=CrossEntropy(average=False), - labels=labels) + optimizer=optim, loss_function=CrossEntropy(average=False)) model.load(path + '/test') shutil.rmtree(path) def test_static_save_dynamic_load(self): path = tempfile.mkdtemp() - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] + net = MyModel() + inputs = [Input('x', [None, 20], 'float32')] + labels = [Input('label', [None, 1], 'int64')] optim = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=model.parameters()) + parameter_list=net.parameters()) + model = Model(net, inputs, labels) model.prepare( - inputs=inputs, - optimizer=optim, - 
loss_function=CrossEntropy(average=False), - labels=labels) + optimizer=optim, loss_function=CrossEntropy(average=False)) model.save(path + '/test') - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - labels = [Input([None, 1], 'int64', name='label')] + net = MyModel() + inputs = [Input('x', [None, 20], 'float32')] + labels = [Input('label', [None, 1], 'int64')] optim = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=model.parameters()) + parameter_list=net.parameters()) + model = Model(net, inputs, labels) model.prepare( - inputs=inputs, - optimizer=optim, - loss_function=CrossEntropy(average=False), - labels=labels) + optimizer=optim, loss_function=CrossEntropy(average=False)) model.load(path + '/test') shutil.rmtree(path) fluid.disable_dygraph() def test_parameters(self): for dynamic in [True, False]: - device = set_device('cpu') + device = hapi.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None - model = MyModel() - inputs = [Input([None, 20], 'float32', name='x')] - model.prepare(inputs=inputs) + net = MyModel() + inputs = [Input('x', [None, 20], 'float32')] + model = Model(net, inputs) + model.prepare() params = model.parameters() self.assertTrue(params[0].shape[0] == 20) self.assertTrue(params[0].shape[1] == 10) fluid.disable_dygraph() if dynamic else None def test_export_deploy_model(self): - model = LeNet() - inputs = [Input([-1, 1, 28, 28], 'float32', name='image')] - model.prepare(inputs=inputs) + net = LeNet() + inputs = [Input('image', [-1, 1, 28, 28], 'float32')] + model = Model(net, inputs) + model.prepare() save_dir = tempfile.mkdtemp() if not os.path.exists(save_dir): os.makedirs(save_dir) @@ -476,7 +461,7 @@ def test_export_deploy_model(self): feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) - np.testing.assert_allclose(results, ori_results) + 
np.testing.assert_allclose(results, ori_results, rtol=1e-6) shutil.rmtree(save_dir) diff --git a/python/paddle/incubate/hapi/tests/test_pretrained_model.py b/python/paddle/incubate/hapi/tests/test_pretrained_model.py new file mode 100644 index 0000000000000..588797322f4ab --- /dev/null +++ b/python/paddle/incubate/hapi/tests/test_pretrained_model.py @@ -0,0 +1,50 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +import paddle.incubate.hapi.vision.models as models +from paddle.incubate.hapi import Model, Input + + +# test the predicted resutls of static graph and dynamic graph are equal +# when used pretrained model +class TestPretrainedModel(unittest.TestCase): + def infer(self, x, arch, dygraph=True): + if dygraph: + fluid.enable_dygraph() + + net = models.__dict__[arch](pretrained=True, classifier_activation=None) + inputs = [Input('image', [None, 3, 224, 224], 'float32')] + model = Model(network=net, inputs=inputs) + model.prepare() + res = model.test_batch(x) + + if dygraph: + fluid.disable_dygraph() + return res + + def test_models(self): + arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18'] + for arch in arches: + x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) + y_dygraph = self.infer(x, arch) + y_static = self.infer(x, arch, dygraph=False) + np.testing.assert_allclose(y_dygraph, y_static) + + +if __name__ 
== '__main__': + unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/incubate/hapi/tests/test_text.py index ec056ff2c48f5..78f089b06a38d 100644 --- a/python/paddle/incubate/hapi/tests/test_text.py +++ b/python/paddle/incubate/hapi/tests/test_text.py @@ -23,7 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear, Layer from paddle.fluid.layers import BeamSearchDecoder -from paddle.incubate.hapi.model import Model, Input, set_device +from paddle.incubate.hapi import Model, Input, set_device from paddle.incubate.hapi.text import * @@ -36,7 +36,7 @@ def setUpClass(cls): np.random.seed(cls._random_seed) random.seed(cls._random_seed) - cls.model_cls = type(cls.__name__ + "Model", (Model, ), { + cls.model_cls = type(cls.__name__ + "Model", (Layer, ), { "__init__": cls.model_init_wrapper(cls.model_init), "forward": cls.model_forward }) @@ -49,7 +49,7 @@ def tearDownClass(cls): @staticmethod def model_init_wrapper(func): def __impl__(self, *args, **kwargs): - Model.__init__(self) + Layer.__init__(self) func(self, *args, **kwargs) return __impl__ @@ -89,9 +89,10 @@ def _calc_output(self, place, mode="test", dygraph=True): fluid.disable_dygraph() fluid.default_main_program().random_seed = self._random_seed fluid.default_startup_program().random_seed = self._random_seed - model = self.model_cls(**self.attrs) if isinstance( + layer = self.model_cls(**self.attrs) if isinstance( self.attrs, dict) else self.model_cls(*self.attrs) - model.prepare(inputs=self.make_inputs(), device=place) + model = Model(layer, inputs=self.make_inputs()) + model.prepare() if self.param_states: model.load(self.param_states, optim_state=None) return model.test_batch(self.inputs) @@ -141,10 +142,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), ] return 
inputs @@ -170,10 +168,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), ] return inputs @@ -224,11 +219,8 @@ def model_forward(model, init_hidden, init_cell): def make_inputs(self): inputs = [ - Input( - [None, self.inputs[0].shape[-1]], "float32", - name="init_hidden"), - Input( - [None, self.inputs[1].shape[-1]], "float32", name="init_cell"), + Input("init_hidden", [None, self.inputs[0].shape[-1]], "float32"), + Input("init_cell", [None, self.inputs[1].shape[-1]], "float32"), ] return inputs @@ -280,14 +272,10 @@ def model_forward(model, enc_input, attn_bias): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[0].shape[-1]], - "float32", - name="enc_input"), - Input( - [None, self.inputs[1].shape[1], None, None], - "float32", - name="attn_bias"), + Input("enc_input", [None, None, self.inputs[0].shape[-1]], + "float32"), + Input("attn_bias", [None, self.inputs[1].shape[1], None, None], + "float32"), ] return inputs @@ -348,22 +336,14 @@ def model_forward(model, def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[0].shape[-1]], - "float32", - name="dec_input"), - Input( - [None, None, self.inputs[0].shape[-1]], - "float32", - name="enc_output"), - Input( - [None, self.inputs[-1].shape[1], None, None], - "float32", - name="self_attn_bias"), - Input( - [None, self.inputs[-1].shape[1], None, None], - "float32", - name="cross_attn_bias"), + Input("dec_input", [None, None, self.inputs[0].shape[-1]], + "float32"), + Input("enc_output", [None, None, self.inputs[0].shape[-1]], + "float32"), + Input("self_attn_bias", + [None, self.inputs[-1].shape[1], None, None], "float32"), + Input("cross_attn_bias", + [None, self.inputs[-1].shape[1], None, None], "float32"), ] return inputs @@ -451,14 +431,10 @@ def model_forward(model, enc_output, trg_src_attn_bias): def 
make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[0].shape[-1]], - "float32", - name="enc_output"), - Input( - [None, self.inputs[1].shape[1], None, None], - "float32", - name="trg_src_attn_bias"), + Input("enc_output", [None, None, self.inputs[0].shape[-1]], + "float32"), + Input("trg_src_attn_bias", + [None, self.inputs[1].shape[1], None, None], "float32"), ] return inputs @@ -497,12 +473,9 @@ def model_forward(model, word, lengths, target=None): def make_inputs(self): inputs = [ - Input( - [None, None], "int64", name="word"), - Input( - [None], "int64", name="lengths"), - Input( - [None, None], "int64", name="target"), + Input("word", [None, None], "int64"), + Input("lengths", [None], "int64"), + Input("target", [None, None], "int64"), ] return inputs @@ -544,10 +517,7 @@ def model_forward(self, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), ] return inputs @@ -573,10 +543,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), ] return inputs @@ -612,10 +579,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), ] return inputs @@ -645,10 +609,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", [None, None, self.inputs[-1].shape[-1]], "float32"), ] return inputs @@ -684,10 +645,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, None, self.inputs[-1].shape[-1]], - "float32", - name="input"), + Input("input", 
[None, None, self.inputs[-1].shape[-1]], "float32"), ] return inputs @@ -722,9 +680,7 @@ def model_forward(model, inputs): def make_inputs(self): inputs = [ - Input( - [None, self.inputs[-1].shape[1], None], "float32", - name="input"), + Input("input", [None, self.inputs[-1].shape[1], None], "float32"), ] return inputs diff --git a/python/paddle/incubate/hapi/tests/test_transforms.py b/python/paddle/incubate/hapi/tests/test_transforms.py index 197b8e6a4925a..087f2d1615fc9 100644 --- a/python/paddle/incubate/hapi/tests/test_transforms.py +++ b/python/paddle/incubate/hapi/tests/test_transforms.py @@ -23,6 +23,7 @@ from paddle.incubate.hapi.datasets import DatasetFolder from paddle.incubate.hapi.vision.transforms import transforms +import paddle.incubate.hapi.vision.transforms.functional as F class TestTransforms(unittest.TestCase): @@ -100,6 +101,78 @@ def test_color_jitter(self): ]) self.do_transform(trans) + def test_rotate(self): + trans = transforms.Compose([ + transforms.RandomRotate(90), + transforms.RandomRotate([-10, 10]), + transforms.RandomRotate( + 45, expand=True), + transforms.RandomRotate( + 10, expand=True, center=(60, 80)), + ]) + self.do_transform(trans) + + def test_pad(self): + trans = transforms.Compose([transforms.Pad(2)]) + self.do_transform(trans) + + fake_img = np.random.rand(200, 150, 3).astype('float32') + trans_pad = transforms.Pad(10) + fake_img_padded = trans_pad(fake_img) + np.testing.assert_equal(fake_img_padded.shape, (220, 170, 3)) + trans_pad1 = transforms.Pad([1, 2]) + trans_pad2 = transforms.Pad([1, 2, 3, 4]) + img = trans_pad1(fake_img) + img = trans_pad2(img) + + def test_erase(self): + trans = transforms.Compose( + [transforms.RandomErasing(), transforms.RandomErasing(value=0.0)]) + self.do_transform(trans) + + def test_random_crop(self): + trans = transforms.Compose([ + transforms.RandomCrop(200), + transforms.RandomCrop((140, 160)), + ]) + self.do_transform(trans) + + trans_random_crop1 = transforms.RandomCrop(224) + 
trans_random_crop2 = transforms.RandomCrop((140, 160)) + + fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img_crop1 = trans_random_crop1(fake_img) + fake_img_crop2 = trans_random_crop2(fake_img_crop1) + + np.testing.assert_equal(fake_img_crop1.shape, (224, 224, 3)) + + np.testing.assert_equal(fake_img_crop2.shape, (140, 160, 3)) + + trans_random_crop_same = transforms.RandomCrop((140, 160)) + img = trans_random_crop_same(fake_img_crop2) + + trans_random_crop_bigger = transforms.RandomCrop((180, 200)) + img = trans_random_crop_bigger(img) + + trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) + img = trans_random_crop_pad(img) + + def test_grayscale(self): + trans = transforms.Compose([transforms.Grayscale()]) + self.do_transform(trans) + + trans_gray = transforms.Grayscale() + fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img_gray = trans_gray(fake_img) + + np.testing.assert_equal(len(fake_img_gray.shape), 2) + np.testing.assert_equal(fake_img_gray.shape[0], 500) + np.testing.assert_equal(fake_img_gray.shape[1], 400) + + trans_gray3 = transforms.Grayscale(3) + fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img_gray = trans_gray3(fake_img) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) @@ -123,6 +196,36 @@ def test_exception(self): with self.assertRaises(ValueError): transforms.BrightnessTransform(-1.0) + with self.assertRaises(ValueError): + transforms.Pad([1.0, 2.0, 3.0]) + + with self.assertRaises(TypeError): + fake_img = np.random.rand(100, 120, 3).astype('float32') + F.pad(fake_img, '1') + + with self.assertRaises(TypeError): + fake_img = np.random.rand(100, 120, 3).astype('float32') + F.pad(fake_img, 1, {}) + + with self.assertRaises(TypeError): + fake_img = np.random.rand(100, 120, 3).astype('float32') + F.pad(fake_img, 1, padding_mode=-1) + + with self.assertRaises(ValueError): + fake_img = np.random.rand(100, 120, 3).astype('float32') + F.pad(fake_img, [1.0, 
2.0, 3.0]) + + with self.assertRaises(ValueError): + transforms.RandomRotate(-2) + + with self.assertRaises(ValueError): + transforms.RandomRotate([1, 2, 3]) + + with self.assertRaises(ValueError): + trans_gray = transforms.Grayscale(5) + fake_img = np.random.rand(100, 120, 3).astype('float32') + trans_gray(fake_img) + def test_info(self): str(transforms.Compose([transforms.Resize((224, 224))])) str(transforms.BatchCompose([transforms.Resize((224, 224))])) diff --git a/python/paddle/incubate/hapi/tests/test_vision_models.py b/python/paddle/incubate/hapi/tests/test_vision_models.py index 1981edd85af7e..16dbe431be801 100644 --- a/python/paddle/incubate/hapi/tests/test_vision_models.py +++ b/python/paddle/incubate/hapi/tests/test_vision_models.py @@ -16,7 +16,7 @@ import numpy as np import paddle.incubate.hapi.vision.models as models -from paddle.incubate.hapi.model import Input +import paddle.incubate.hapi as hapi class TestVisonModels(unittest.TestCase): @@ -24,13 +24,13 @@ def models_infer(self, arch, pretrained=False, batch_norm=False): x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) if batch_norm: - model = models.__dict__[arch](pretrained=pretrained, - batch_norm=True) + net = models.__dict__[arch](pretrained=pretrained, batch_norm=True) else: - model = models.__dict__[arch](pretrained=pretrained) - inputs = [Input([None, 3, 224, 224], 'float32', name='image')] + net = models.__dict__[arch](pretrained=pretrained) - model.prepare(inputs=inputs) + input = hapi.Input('image', [None, 3, 224, 224], 'float32') + model = hapi.Model(net, input) + model.prepare() model.test_batch(x) @@ -71,10 +71,9 @@ def test_resnet152(self): self.models_infer('resnet152') def test_lenet(self): - lenet = models.__dict__['LeNet']() - - inputs = [Input([None, 1, 28, 28], 'float32', name='x')] - lenet.prepare(inputs=inputs) + input = hapi.Input('x', [None, 1, 28, 28], 'float32') + lenet = hapi.Model(models.__dict__['LeNet'](), input) + lenet.prepare() x = 
np.array(np.random.random((2, 1, 28, 28)), dtype=np.float32) lenet.test_batch(x) diff --git a/python/paddle/incubate/hapi/vision/models/lenet.py b/python/paddle/incubate/hapi/vision/models/lenet.py index 45094119f0790..db1d894b4aa5f 100644 --- a/python/paddle/incubate/hapi/vision/models/lenet.py +++ b/python/paddle/incubate/hapi/vision/models/lenet.py @@ -15,12 +15,10 @@ import paddle.fluid as fluid from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential -from ...model import Model - __all__ = ['LeNet'] -class LeNet(Model): +class LeNet(fluid.dygraph.Layer): """LeNet model from `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ diff --git a/python/paddle/incubate/hapi/vision/models/mobilenetv1.py b/python/paddle/incubate/hapi/vision/models/mobilenetv1.py index ced7a0b61374c..5022a065a5975 100644 --- a/python/paddle/incubate/hapi/vision/models/mobilenetv1.py +++ b/python/paddle/incubate/hapi/vision/models/mobilenetv1.py @@ -17,7 +17,6 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from ...model import Model from ...download import get_weights_path_from_url __all__ = ['MobileNetV1', 'mobilenet_v1'] @@ -103,7 +102,7 @@ def forward(self, inputs): return y -class MobileNetV1(Model): +class MobileNetV1(fluid.dygraph.Layer): """MobileNetV1 model from `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. 
@@ -276,7 +275,8 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) + param, _ = fluid.load_dygraph(weight_path) + model.load_dict(param) return model diff --git a/python/paddle/incubate/hapi/vision/models/mobilenetv2.py b/python/paddle/incubate/hapi/vision/models/mobilenetv2.py index 0b8a220726615..d5cbfc7b96114 100644 --- a/python/paddle/incubate/hapi/vision/models/mobilenetv2.py +++ b/python/paddle/incubate/hapi/vision/models/mobilenetv2.py @@ -18,7 +18,6 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from ...model import Model from ...download import get_weights_path_from_url __all__ = ['MobileNetV2', 'mobilenet_v2'] @@ -150,7 +149,7 @@ def forward(self, inputs): return y -class MobileNetV2(Model): +class MobileNetV2(fluid.dygraph.Layer): """MobileNetV2 model from `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. 
@@ -252,7 +251,8 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) + param, _ = fluid.load_dygraph(weight_path) + model.load_dict(param) return model diff --git a/python/paddle/incubate/hapi/vision/models/resnet.py b/python/paddle/incubate/hapi/vision/models/resnet.py index fa6d77e9b1630..858934e1c179f 100644 --- a/python/paddle/incubate/hapi/vision/models/resnet.py +++ b/python/paddle/incubate/hapi/vision/models/resnet.py @@ -21,7 +21,6 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.container import Sequential -from ...model import Model from ...download import get_weights_path_from_url __all__ = [ @@ -166,7 +165,7 @@ def forward(self, inputs): return fluid.layers.relu(x) -class ResNet(Model): +class ResNet(fluid.dygraph.Layer): """ResNet model from `"Deep Residual Learning for Image Recognition" `_ @@ -278,7 +277,9 @@ def _resnet(arch, Block, depth, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) + param, _ = fluid.load_dygraph(weight_path) + model.set_dict(param) + return model diff --git a/python/paddle/incubate/hapi/vision/models/vgg.py b/python/paddle/incubate/hapi/vision/models/vgg.py index 668b4431ebd7b..74e7228e5249f 100644 --- a/python/paddle/incubate/hapi/vision/models/vgg.py +++ b/python/paddle/incubate/hapi/vision/models/vgg.py @@ -16,7 +16,6 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.container import Sequential -from ...model import Model from ...download import get_weights_path_from_url __all__ = [ @@ -51,7 +50,7 @@ def forward(self, x): return out -class VGG(Model): +class VGG(fluid.dygraph.Layer): """VGG model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ @@ -144,7 +143,8 @@ def 
_vgg(arch, cfg, batch_norm, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - model.load(weight_path) + param, _ = fluid.load_dygraph(weight_path) + model.load_dict(param) return model diff --git a/python/paddle/incubate/hapi/vision/transforms/functional.py b/python/paddle/incubate/hapi/vision/transforms/functional.py index e19d5054ed902..f76aa6be8b4dd 100644 --- a/python/paddle/incubate/hapi/vision/transforms/functional.py +++ b/python/paddle/incubate/hapi/vision/transforms/functional.py @@ -15,8 +15,10 @@ import sys import collections import random +import math import cv2 +import numbers import numpy as np if sys.version_info < (3, 3): @@ -26,7 +28,7 @@ Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable -__all__ = ['flip', 'resize'] +__all__ = ['flip', 'resize', 'pad', 'rotate', 'to_grayscale'] def flip(image, code): @@ -99,3 +101,202 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR): return cv2.resize(img, (ow, oh), interpolation=interpolation) else: return cv2.resize(img, size[::-1], interpolation=interpolation) + + +def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): + """Pads the given CV Image on all sides with speficified padding mode and fill value. + + Args: + img (np.ndarray): Image to be padded. + padding (int|tuple): Padding on each border. If a single int is provided this + is used to pad all borders. If tuple of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a tuple of length 4 is provided + this is the padding for the left, top, right and bottom borders + respectively. + fill (int|tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. 
+ ``constant`` means padding with a constant value, this value is specified with fill. + ``edge`` means padding with the last value at the edge of the image. + ``reflect`` means padding with reflection of image (without repeating the last value on the edge) + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode + will result in ``[3, 2, 1, 2, 3, 4, 3, 2]``. + ``symmetric`` menas pads with reflection of image (repeating the last value on the edge) + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode + will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``. + + Returns: + numpy ndarray: Padded image. + + Examples: + + .. code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms.functional import pad + + fake_img = np.random.rand(500, 500, 3).astype('float32') + + fake_img = pad(fake_img, 2) + print(fake_img.shape) + + """ + + if not isinstance(padding, (numbers.Number, list, tuple)): + raise TypeError('Got inappropriate padding arg') + if not isinstance(fill, (numbers.Number, str, list, tuple)): + raise TypeError('Got inappropriate fill arg') + if not isinstance(padding_mode, str): + raise TypeError('Got inappropriate padding_mode arg') + + if isinstance(padding, collections.Sequence) and len(padding) not in [2, 4]: + raise ValueError( + "Padding must be an int or a 2, or 4 element tuple, not a " + + "{} element tuple".format(len(padding))) + + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \ + 'Expected padding mode be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode) + + PAD_MOD = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_DEFAULT, + 'symmetric': cv2.BORDER_REFLECT + } + + if isinstance(padding, int): + pad_left = pad_right = pad_top = pad_bottom = padding + if isinstance(padding, collections.Sequence) and len(padding) == 2: + pad_left = pad_right = padding[0] + pad_top = pad_bottom = padding[1] + if 
isinstance(padding, collections.Sequence) and len(padding) == 4: + pad_left, pad_top, pad_right, pad_bottom = padding + + if isinstance(fill, numbers.Number): + fill = (fill, ) * (2 * len(img.shape) - 3) + + if padding_mode == 'constant': + assert (len(fill) == 3 and len(img.shape) == 3) or (len(fill) == 1 and len(img.shape) == 2), \ + 'channel of image is {} but length of fill is {}'.format(img.shape[-1], len(fill)) + + img = cv2.copyMakeBorder( + src=img, + top=pad_top, + bottom=pad_bottom, + left=pad_left, + right=pad_right, + borderType=PAD_MOD[padding_mode], + value=fill) + + return img + + +def rotate(img, + angle, + interpolation=cv2.INTER_LINEAR, + expand=False, + center=None): + """Rotates the image by angle. + + Args: + img (numpy.ndarray): Image to be rotated. + angle (float|int): In degrees clockwise order. + interpolation (int, optional): + interpolation: Interpolation method. + expand (bool|optional): Optional expansion flag. + If true, expands the output image to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (2-tuple|optional): Optional center of rotation. + Origin is the upper left corner. + Default is the center of the image. + + Returns: + numpy ndarray: Rotated image. + + Examples: + + .. 
code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms.functional import rotate + + fake_img = np.random.rand(500, 500, 3).astype('float32') + + fake_img = rotate(fake_img, 10) + print(fake_img.shape) + """ + dtype = img.dtype + + h, w, _ = img.shape + point = center or (w / 2, h / 2) + M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1) + + if expand: + if center is None: + cos = np.abs(M[0, 0]) + sin = np.abs(M[0, 1]) + + nW = int((h * sin) + (w * cos)) + nH = int((h * cos) + (w * sin)) + + M[0, 2] += (nW / 2) - point[0] + M[1, 2] += (nH / 2) - point[1] + + dst = cv2.warpAffine(img, M, (nW, nH)) + else: + xx = [] + yy = [] + for point in (np.array([0, 0, 1]), np.array([w - 1, 0, 1]), + np.array([w - 1, h - 1, 1]), np.array([0, h - 1, 1])): + target = np.dot(M, point) + xx.append(target[0]) + yy.append(target[1]) + nh = int(math.ceil(max(yy)) - math.floor(min(yy))) + nw = int(math.ceil(max(xx)) - math.floor(min(xx))) + + M[0, 2] += (nw - w) / 2 + M[1, 2] += (nh - h) / 2 + dst = cv2.warpAffine(img, M, (nw, nh), flags=interpolation) + else: + dst = cv2.warpAffine(img, M, (w, h), flags=interpolation) + return dst.astype(dtype) + + +def to_grayscale(img, num_output_channels=1): + """Converts image to grayscale version of image. + + Args: + img (numpy.ndarray): Image to be converted to grayscale. + + Returns: + numpy.ndarray: Grayscale version of the image. + if num_output_channels == 1, returned image is single channel + if num_output_channels == 3, returned image is 3 channel with r == g == b + + Examples: + + .. 
code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms.functional import to_grayscale + + fake_img = np.random.rand(500, 500, 3).astype('float32') + + fake_img = to_grayscale(fake_img) + print(fake_img.shape) + """ + + if num_output_channels == 1: + img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + elif num_output_channels == 3: + img = cv2.cvtColor( + cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), cv2.COLOR_GRAY2RGB) + else: + raise ValueError('num_output_channels should be either 1 or 3') + + return img diff --git a/python/paddle/incubate/hapi/vision/transforms/transforms.py b/python/paddle/incubate/hapi/vision/transforms/transforms.py index a99f7f99a93f4..90c6e279959b2 100644 --- a/python/paddle/incubate/hapi/vision/transforms/transforms.py +++ b/python/paddle/incubate/hapi/vision/transforms/transforms.py @@ -52,6 +52,11 @@ "ContrastTransform", "HueTransform", "ColorJitter", + "RandomCrop", + "RandomErasing", + "Pad", + "RandomRotate", + "Grayscale", ] @@ -125,7 +130,7 @@ class BatchCompose(object): import numpy as np from paddle.io import DataLoader - from paddle.incubate.hapi.model import set_device + from paddle.incubate.hapi import set_device from paddle.incubate.hapi.datasets import Flowers from paddle.incubate.hapi.vision.transforms import Compose, BatchCompose, Resize @@ -756,17 +761,13 @@ class ColorJitter(object): Args: brightness: How much to jitter brightness. - Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] - or the given [min, max]. Should be non negative numbers. + Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. Should be non negative numbers. contrast: How much to jitter contrast. - Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] - or the given [min, max]. Should be non negative numbers. + Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. Should be non negative numbers. saturation: How much to jitter saturation. 
- Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] - or the given [min, max]. Should be non negative numbers. + Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. Should be non negative numbers. hue: How much to jitter hue. - Chosen uniformly from [-hue, hue] or the given [min, max]. - Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + Chosen uniformly from [-hue, hue]. Should have 0<= hue <= 0.5. Examples: @@ -800,3 +801,342 @@ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): def __call__(self, img): return self.transforms(img) + + +class RandomCrop(object): + """Crops the given CV Image at a random location. + + Args: + size (sequence|int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + padding (int|sequence|optional): Optional padding on each border + of the image. If a sequence of length 4 is provided, it is used to pad left, + top, right, bottom borders respectively. Default: 0. + pad_if_needed (boolean|optional): It will pad the image if smaller than the + desired size to avoid raising an exception. Default: False. + + Examples: + + .. code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms import RandomCrop + + transform = RandomCrop(224) + + fake_img = np.random.rand(500, 500, 3).astype('float32') + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, size, padding=0, pad_if_needed=False): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + self.padding = padding + self.pad_if_needed = pad_if_needed + + def _get_params(self, img, output_size): + """Get parameters for ``crop`` for a random crop. + + Args: + img (numpy.ndarray): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. 
+ + """ + h, w, _ = img.shape + th, tw = output_size + if w == tw and h == th: + return 0, 0, h, w + + try: + i = random.randint(0, h - th) + except ValueError: + i = random.randint(h - th, 0) + try: + j = random.randint(0, w - tw) + except ValueError: + j = random.randint(w - tw, 0) + return i, j, th, tw + + def __call__(self, img): + """ + + Args: + img (numpy.ndarray): Image to be cropped. + Returns: + numpy.ndarray: Cropped image. + + """ + if self.padding > 0: + img = F.pad(img, self.padding) + + # pad the width if needed + if self.pad_if_needed and img.shape[1] < self.size[1]: + img = F.pad(img, (int((1 + self.size[1] - img.shape[1]) / 2), 0)) + # pad the height if needed + if self.pad_if_needed and img.shape[0] < self.size[0]: + img = F.pad(img, (0, int((1 + self.size[0] - img.shape[0]) / 2))) + + i, j, h, w = self._get_params(img, self.size) + + return img[i:i + h, j:j + w] + + +class RandomErasing(object): + """Randomly selects a rectangle region in an image and erases its pixels. + ``Random Erasing Data Augmentation`` by Zhong et al. + See https://arxiv.org/pdf/1708.04896.pdf + + Args: + prob (float): probability that the random erasing operation will be performed. + scale (tuple): range of proportion of erased area against input image. Should be (min, max). + ratio (float): range of aspect ratio of erased area. + value (float|list|tuple): erasing value. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. Default: 0. + + Examples: + + .. 
code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms import RandomCrop + + transform = RandomCrop(224) + + fake_img = np.random.rand(500, 500, 3).astype('float32') + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + prob=0.5, + scale=(0.02, 0.4), + ratio=0.3, + value=[0., 0., 0.]): + assert isinstance(value, ( + float, Sequence + )), "Expected type of value in [float, list, tupue], but got {}".format( + type(value)) + assert scale[0] <= scale[1], "scale range should be of kind (min, max)!" + + if isinstance(value, float): + self.value = [value, value, value] + else: + self.value = value + + self.p = prob + self.scale = scale + self.ratio = ratio + + def __call__(self, img): + if random.uniform(0, 1) > self.p: + return img + + for _ in range(100): + area = img.shape[0] * img.shape[1] + + target_area = random.uniform(self.scale[0], self.scale[1]) * area + aspect_ratio = random.uniform(self.ratio, 1 / self.ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < img.shape[1] and h < img.shape[0]: + x1 = random.randint(0, img.shape[0] - h) + y1 = random.randint(0, img.shape[1] - w) + + if len(img.shape) == 3 and img.shape[2] == 3: + img[x1:x1 + h, y1:y1 + w, 0] = self.value[0] + img[x1:x1 + h, y1:y1 + w, 1] = self.value[1] + img[x1:x1 + h, y1:y1 + w, 2] = self.value[2] + else: + img[x1:x1 + h, y1:y1 + w] = self.value[1] + return img + + return img + + +class Pad(object): + """Pads the given CV Image on all sides with the given "pad" value. + + Args: + padding (int|list|tuple): Padding on each border. If a single int is provided this + is used to pad all borders. If tuple of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a tuple of length 4 is provided + this is the padding for the left, top, right and bottom borders + respectively. 
+ fill (int|list|tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. + ``constant`` means pads with a constant value, this value is specified with fill. + ``edge`` means pads with the last value at the edge of the image. + ``reflect`` means pads with reflection of image (without repeating the last value on the edge) + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode + will result in ``[3, 2, 1, 2, 3, 4, 3, 2]``. + ``symmetric`` menas pads with reflection of image (repeating the last value on the edge) + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode + will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``. + + Examples: + + .. code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms import Pad + + transform = Pad(2) + + fake_img = np.random.rand(500, 500, 3).astype('float32') + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, padding, fill=0, padding_mode='constant'): + assert isinstance(padding, (numbers.Number, list, tuple)) + assert isinstance(fill, (numbers.Number, str, list, tuple)) + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + if isinstance(padding, + collections.Sequence) and len(padding) not in [2, 4]: + raise ValueError( + "Padding must be an int or a 2, or 4 element tuple, not a " + + "{} element tuple".format(len(padding))) + + self.padding = padding + self.fill = fill + self.padding_mode = padding_mode + + def __call__(self, img): + """ + Args: + img (numpy.ndarray): Image to be padded. + Returns: + numpy.ndarray: Padded image. + """ + return F.pad(img, self.padding, self.fill, self.padding_mode) + + +class RandomRotate(object): + """Rotates the image by angle. 
+ + Args: + degrees (sequence or float or int): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees) clockwise order. + interpolation (int|optional): Interpolation mode of resize. Default: cv2.INTER_LINEAR. + expand (bool|optional): Optional expansion flag. Default: False. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (2-tuple|optional): Optional center of rotation. + Origin is the upper left corner. + Default is the center of the image. + + Examples: + + .. code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms import RandomRotate + + transform = RandomRotate(90) + + fake_img = np.random.rand(500, 400, 3).astype('float32') + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + degrees, + interpolation=cv2.INTER_LINEAR, + expand=False, + center=None): + if isinstance(degrees, numbers.Number): + if degrees < 0: + raise ValueError( + "If degrees is a single number, it must be positive.") + self.degrees = (-degrees, degrees) + else: + if len(degrees) != 2: + raise ValueError( + "If degrees is a sequence, it must be of len 2.") + self.degrees = degrees + + self.interpolation = interpolation + self.expand = expand + self.center = center + + def _get_params(self, degrees): + """Get parameters for ``rotate`` for a random rotation. + Returns: + sequence: params to be passed to ``rotate`` for random rotation. + """ + angle = random.uniform(degrees[0], degrees[1]) + + return angle + + def __call__(self, img): + """ + img (np.ndarray): Image to be rotated. + Returns: + np.ndarray: Rotated image. 
+ """ + + angle = self._get_params(self.degrees) + + return F.rotate(img, angle, self.interpolation, self.expand, + self.center) + + +class Grayscale(object): + """Converts image to grayscale. + + Args: + output_channels (int): (1 or 3) number of channels desired for output image + Returns: + CV Image: Grayscale version of the input. + - If output_channels == 1 : returned image is single channel + - If output_channels == 3 : returned image is 3 channel with r == g == b + + Examples: + + .. code-block:: python + + import numpy as np + + from paddle.incubate.hapi.vision.transforms import Grayscale + + transform = Grayscale() + + fake_img = np.random.rand(500, 400, 3).astype('float32') + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, output_channels=1): + self.output_channels = output_channels + + def __call__(self, img): + """ + Args: + img (numpy.ndarray): Image to be converted to grayscale. + Returns: + numpy.ndarray: Randomly grayscaled image. + """ + return F.to_grayscale(img, num_output_channels=self.output_channels) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 62afe63471693..8ffe9613995a8 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -105,6 +105,7 @@ from .math import atan #DEFINE_ALIAS from .math import ceil #DEFINE_ALIAS from .math import cos #DEFINE_ALIAS +from .math import cosh #DEFINE_ALIAS from .math import cumsum #DEFINE_ALIAS from .math import elementwise_add #DEFINE_ALIAS from .math import elementwise_div #DEFINE_ALIAS @@ -112,14 +113,12 @@ from .math import elementwise_max #DEFINE_ALIAS from .math import elementwise_min #DEFINE_ALIAS from .math import elementwise_mod #DEFINE_ALIAS -from .math import elementwise_mul #DEFINE_ALIAS from .math import elementwise_pow #DEFINE_ALIAS from .math import elementwise_sub #DEFINE_ALIAS from .math import exp #DEFINE_ALIAS from .math import floor #DEFINE_ALIAS from .math import increment 
#DEFINE_ALIAS from .math import log #DEFINE_ALIAS -from .math import mul #DEFINE_ALIAS from .math import multiplex #DEFINE_ALIAS from .math import pow #DEFINE_ALIAS from .math import reciprocal #DEFINE_ALIAS @@ -132,6 +131,7 @@ from .math import scale #DEFINE_ALIAS from .math import sign #DEFINE_ALIAS from .math import sin #DEFINE_ALIAS +from .math import sinh #DEFINE_ALIAS from .math import sqrt #DEFINE_ALIAS from .math import square #DEFINE_ALIAS from .math import stanh #DEFINE_ALIAS @@ -143,6 +143,7 @@ from .math import min #DEFINE_ALIAS from .math import mm #DEFINE_ALIAS from .math import div #DEFINE_ALIAS +from .math import multiply #DEFINE_ALIAS from .math import add #DEFINE_ALIAS from .math import atan #DEFINE_ALIAS from .math import logsumexp #DEFINE_ALIAS diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 4de8b90c02928..ed104b5f3e702 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -21,14 +21,15 @@ from ..fluid.framework import convert_np_dtype_to_dtype_, in_dygraph_mode, _varbase_creator, device_guard, OpProtoHolder from ..fluid.layers import fill_constant from paddle.common_ops_import import * +import paddle # TODO: define functions to get create a tensor from ..fluid.layers import crop_tensor #DEFINE_ALIAS from ..fluid.layers import diag #DEFINE_ALIAS -from ..fluid.layers import eye #DEFINE_ALIAS from ..fluid.layers import fill_constant #DEFINE_ALIAS from ..fluid.layers import create_tensor #DEFINE_ALIAS from ..fluid.layers import linspace #DEFINE_ALIAS +import paddle __all__ = [ 'create_tensor', @@ -74,6 +75,9 @@ def full_like(x, fill_value, dtype=None, name=None): Returns: out(Variable): The Tensor variable storing the output. + Raises: + TypeError: The dtype must be one of bool, float16, float32, float64, int32, int64 and None. + Examples: .. 
code-block:: python @@ -83,7 +87,8 @@ def full_like(x, fill_value, dtype=None, name=None): paddle.enable_imperative() # Now we are in imperative mode input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input') output = paddle.full_like(input, 2.0) - #output result : [array([[2., 2., 2.], [2., 2., 2.]], dtype=float32)] + # [[2. 2. 2.] + # [2. 2. 2.]] """ if dtype is None: @@ -98,7 +103,7 @@ def full_like(x, fill_value, dtype=None, name=None): helper = LayerHelper("full_like", **locals()) check_dtype(dtype, 'dtype', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'full_like/zeros_like') + 'full_like') out = helper.create_variable_for_type_inference(dtype=dtype) helper.append_op( @@ -111,7 +116,7 @@ def full_like(x, fill_value, dtype=None, name=None): return out -def ones(shape, dtype=None, out=None, device=None): +def ones(shape, dtype=None, name=None): """ :alias_main: paddle.ones :alias: paddle.ones,paddle.tensor.ones,paddle.tensor.creation.ones @@ -119,38 +124,44 @@ def ones(shape, dtype=None, out=None, device=None): The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 1. Args: - shape(tuple|list): Shape of output tensor. - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor, it supports - bool, float16, float32, float64, int32 and int64. - out(Variable, optional): Optional output which can be any created - Variable that meets the requirements to store the result of operation. - if out is None, a new Varibale will be create to store the result. - device(str, optional): Which device to run the operator. The :attr:`device` must be - None,'cpu', 'gpu'. If :attr:`device` is None, it will be choose the device that the user set in - the paddle program. Default value is False. - + shape(tuple|list|Variable): Shape of output tensor, the data type of shape is int32 or int64. 
+ dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output tensor, it supports + bool, float16, float32, float64, int32 and int64. Default: if None, the data type is 'float32'. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + Returns: Variable: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. + Raises: + TypeError: The dtype must be one of bool, float16, float32, float64, int32, int64 and None + and the data type of out Tensor must be the same as the dtype. + TypeError: The `shape` must be one of list, tuple and Variable. + Examples: .. code-block:: python - import paddle - data = paddle.ones(shape=[3, 2], dtype='float32') # [[1., 1.], [1., 1.], [1., 1.]] - data = paddle.ones(shape=[2, 2], dtype='float32', device='cpu') # [[1., 1.], [1., 1.]] + import paddle + paddle.enable_imperative() + + #default dtype for ones OP + data1 = paddle.ones(shape=[3, 2]) + # [[1. 1.] + # [1. 1.] + # [1. 1.]] + + data2 = paddle.ones(shape=[2, 2], dtype='int32') + # [[1 1] + # [1 1]] + + #shape is a Variable + shape = paddle.fill_constant(shape=[2], dtype='int32', value=2) + data3 = paddle.ones(shape=shape, dtype='int32') + # [[1 1] + # [1 1]] """ - check_dtype(dtype, 'create data type', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'zeros') - - if device is not None: - if device not in ['cpu', 'gpu']: - raise ValueError( - "The value of 'device' in zeros_op must be cpu or gpu, but received %s." 
- % (device)) - with fluid.device_guard(device): - return fill_constant(value=1.0, shape=shape, dtype=dtype, out=out) - return fill_constant(value=1.0, shape=shape, dtype=dtype, out=out) + if dtype is None: + dtype = 'float32' + return fill_constant(value=1.0, shape=shape, dtype=dtype, name=name) def ones_like(input, dtype=None, device=None, name=None): @@ -294,67 +305,50 @@ def zeros_like(x, dtype=None, name=None): return full_like(x=x, fill_value=0, dtype=dtype, name=name) -def eye(num_rows, - num_columns=None, - out=None, - dtype='float32', - stop_gradient=True, - name=None): +def eye(num_rows, num_columns=None, dtype=None, name=None): """ - **eye** - This function constructs an identity tensor. + This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere. Args: num_rows(int): the number of rows in each batch tensor. num_columns(int, optional): the number of columns in each batch tensor. - If None, default: num_rows. - out(Variable, optional): Optional output which can be any created - Variable that meets the requirements to store the result of operation. - if out is None, a new Varibale will be create to store the result. - dtype(string, optional): The data type of the returned tensor. - It should be int32, int64, float16, float32, float64. - stop_gradient(bool, optional): Whether stop calculating gradients. Default:True. + If None, default: num_rows. + dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of the returned tensor. + It should be int32, int64, float16, float32, float64. Default: if None, the data type + is float32. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: Variable: An identity Tensor or LoDTensor of shape [num_rows, num_columns]. + + Raises: + TypeError: The `dtype` must be one of float16, float32, float64, int32 int64 and None. 
+ TypeError: The `num_columns` must be non-negative int. Examples: .. code-block:: python import paddle + + paddle.enable_imperative() # Now we are in imperative mode data = paddle.eye(3, dtype='int32') - # [[1, 0, 0] - # [0, 1, 0] - # [0, 0, 1]] + # [[1 0 0] + # [0 1 0] + # [0 0 1]] data = paddle.eye(2, 3, dtype='int32') - # [[1, 0, 0] - # [0, 1, 0]] + # [[1 0 0] + # [0 1 0]] """ - helper = LayerHelper("eye", **locals()) - if not isinstance(num_rows, int) or num_rows < 0: - raise TypeError("num_rows should be a non-negative int") - if num_columns is not None: - if not isinstance(num_columns, int) or num_columns < 0: - raise TypeError("num_columns should be a non-negative int") - else: + if dtype is None: + dtype = 'float32' + if num_columns is None: num_columns = num_rows - if out is None: - out = helper.create_variable_for_type_inference(dtype=dtype) - c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op( - type='eye', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'num_rows': num_rows, - 'num_columns': num_columns, - 'dtype': c_dtype - }, - stop_gradient=True) - out.stop_gradient = stop_gradient - return out + return paddle.fluid.layers.eye(num_rows=num_rows, + num_columns=num_columns, + batch_shape=None, + dtype=dtype, + name=name) def full(shape, fill_value, dtype=None, name=None): @@ -382,7 +376,7 @@ def full(shape, fill_value, dtype=None, name=None): Raises: TypeError: The `dtype` must be one of None, bool, float16, float32, float64, int32 and int64. - TypeError: The `shape` must be one of Variable, list tuple. + TypeError: The `shape` must be one of Variable, list and tuple. Examples: .. 
code-block:: python @@ -390,99 +384,108 @@ def full(shape, fill_value, dtype=None, name=None): import paddle paddle.enable_imperative() # Now we are in imperative mode - data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') # data1=[[0],[0]] + data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') + #[[0] + # [0]] # attr shape is a list which contains Variable Tensor. positive_2 = paddle.fill_constant([1], "int32", 2) - data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) # data3=[1.5, 1.5] + data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) + # [[1.5 1.5]] # attr shape is an Variable Tensor. - shape = paddle.fill_constant([2], "int32", 2) # shape=[2,2] - data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) # data4=[[True,True],[True,True]] + shape = paddle.fill_constant([2], "int32", 2) + data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) + # [[True True] + # [True True]] - # attr value is an Variable Tensor. - val = paddle.fill_constant([1], "float32", 2.0) # val=[2.0] - data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') #data5=[[2.0],[2.0]] + # attr fill_value is an Variable Tensor. + val = paddle.fill_constant([1], "float32", 2.0) + data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') + # [[2.0] + # [2.0]] """ - helper = LayerHelper("full", **locals()) - if dtype is None: dtype = 'float32' return fill_constant(shape=shape, dtype=dtype, value=fill_value, name=name) -def arange(start, end, step=1, dtype=None, name=None): +def arange(start=0, end=None, step=1, dtype=None, name=None): """ :alias_main: paddle.arange :alias: paddle.arange,paddle.tensor.arange,paddle.tensor.creation.arange Return evenly spaced values within a given interval. - Values are generated within the half-open interval [start, stop) (in other words, - the interval including start but excluding stop). + Values are generated into the half-open interval [start, stop) with the step. 
+ (the interval including start but excluding stop). + + If dtype is float32 or float64, we advise adding a small epsilon to end to + avoid floating point rounding errors when comparing against end. Parameters: - start(float32 | float64 | int32 | int64 | Variable): Start of interval. The interval includes this value. - when start is Variable, it is a 1-D Tensor with shape [1]. - end(float32 | float64 | int32 | int64 | Variable): End of interval. The interval does not include this - value, except in some cases where step is not an integer - and floating point round-off affects the length of out. When end is Variable, - it is a 1-D Tensor with shape [1]. - step(float32 | float64 | int32 | int64 | Variable): Spacing between values. For any output out, this is the - distance between two adjacent values, out[i+1] - out[i]. - dtype(str|core.VarDesc.VarType): the data type of the output tensor, can be float32, float64, int32, int64. - - Returns: a 1-D Tensor which is evenly spaced values within a given interval. Its data type is set by dtype. + start(float|int|Variable): Start of interval. The interval includes + this value. If end is None, the half-open interval is [0, start). + If start is Variable, it is a 1-D Tensor with shape [1], and it's + data type should be one of int32, int64, float32, float64. Default + is 0. + end(float|int|Variable, optional): End of interval. The interval does + not include this value. When end is Variable, it is a 1-D Tensor + with shape [1], and it's data type should be one of int32, int64, + float32, float64. If end is None, the half-open interval is [0, start). + Default is None. + step(float|int|Variable, optional): Spacing between values. For any + out, this is the istance between two adjacent values, out[i+1] - out[i]. + When end is Variable, it is a 1-D Tensor with shape [1], and it's + data type should be one of int32, int64, float32, float64. Default is 1. 
+ dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of + the output tensor, can be float32, float64, int32, int64. If dtype + is `None` , the data type of out tensor is `int64` . Defaule is None + name(str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + Default is None. + + Returns: a 1-D Tensor which is evenly spaced values within a given interval. + Its data type is set by dtype. Return type: Variable + Raises: + TypeError: If dtype is not float32, float64, int32 or int64. + examples: .. code-block:: python - import paddle - # expected out put: [0, 2, 4, 6, 8] - data = paddle.arange(0, 10, 2, 'int32') - - #dygraph mode - import paddle - import paddle.fluid as fluid - with fluid.dygraph.guard(): - x = paddle.arange(0, 6, 2) - # x: [0, 2, 4] - # x dtype: float32 - - """ - helper = LayerHelper("range", **locals()) - - if dtype is None: - dtype = 'float32' + import paddle + import numpy as np - check_dtype(dtype, 'create data type', - ['float32', 'float64', 'int32', 'int64'], 'range') + paddle.enable_imperative() - dtype = convert_dtype(dtype) - if not isinstance(start, Variable): - start = fill_constant([1], dtype, start) + out1 = paddle.arange(5) + # [0, 1, 2, 3, 4] - if not isinstance(end, Variable): - end = fill_constant([1], dtype, end) + out2 = paddle.arange(3, 9, 2.0) + # [3, 5, 7] - if not isinstance(step, Variable): - step = fill_constant([1], dtype, step) + # use 4.999 instead of 5.0 to avoid floating point rounding errors + out3 = paddle.arange(4.999, dtype='float32') + # [0., 1., 2., 3., 4.] 
- out = helper.create_variable_for_type_inference(dtype=start.dtype) + start_var = paddle.imperative.to_variable(np.array([3])) + out4 = paddle.arange(start_var, 7) + # [3, 4, 5, 6] + + """ + if dtype is None: + dtype = 'int64' + if end is None: + end = start + start = 0 - helper.append_op( - type='range', - inputs={'Start': start, - 'End': end, - 'Step': step}, - outputs={'Out': [out]}) - out.stop_gradient = True - return out + return paddle.fluid.layers.range(start, end, step, dtype, name) def _tril_triu_op(helper): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f0e1c78f11750..7ec68dc8980a1 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -583,66 +583,69 @@ def t(input, name=None): return out -def cross(input, other, dim=None): +def cross(x, y, axis=None, name=None): """ :alias_main: paddle.cross :alias: paddle.cross,paddle.tensor.cross,paddle.tensor.linalg.cross - Returns the cross product of vectors in dimension `dim` of the `input` and `other` tensor. - Inputs must have the same shape, and the size of their dim-th dimension should be equla to 3. - If `dim` is not given, it defaults to the first dimension found with the size 3. + Computes the cross product between two tensors along an axis. + Inputs must have the same shape, and the length of their axes should be equal to 3. + If `axis` is not given, it defaults to the first axis found with the length 3. Args: - input (Variable): The first input tensor variable. - other (Variable): The second input tensor variable. - dim (int): The dimension to take the cross-product in. + x (Variable): The first input tensor variable. + y (Variable): The second input tensor variable. + axis (int, optional): The axis along which to compute the cross product. It defaults to the first axis found with the length 3. + name (str, optional): The default value is None. Normally there is no need for + user to set this property. 
For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: A Tensor with same data type as `input`. + Variable: A Tensor with same data type as `x`. Examples: .. code-block:: python import paddle - import paddle.fluid as fluid + from paddle.imperative import to_variable import numpy as np + paddle.enable_imperative() + data_x = np.array([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [3.0, 3.0, 3.0]]) data_y = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) - - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data_x) - y = fluid.dygraph.to_variable(data_y) - out_z1 = paddle.cross(x, y) - print(out_z1.numpy()) - #[[-1. -1. -1.] - # [ 2. 2. 2.] - # [-1. -1. -1.]] - out_z2 = paddle.cross(x, y, dim=1) - print(out_z2.numpy()) - #[[0. 0. 0.] - # [0. 0. 0.] - # [0. 0. 0.]] + x = to_variable(data_x) + y = to_variable(data_y) + + z1 = paddle.cross(x, y) + print(z1.numpy()) + # [[-1. -1. -1.] + # [ 2. 2. 2.] + # [-1. -1. -1.]] + + z2 = paddle.cross(x, y, axis=1) + print(z2.numpy()) + # [[0. 0. 0.] + # [0. 0. 0.] + # [0. 0. 
0.]] """ - helper = LayerHelper("cross", **locals()) if in_dygraph_mode(): - if dim: - return core.ops.cross(input, other, 'dim', dim) + if axis: + return core.ops.cross(x, y, 'dim', axis) else: - return core.ops.cross(input, other) + return core.ops.cross(x, y) - out = helper.create_variable_for_type_inference(input.dtype) + helper = LayerHelper("cross", **locals()) + out = helper.create_variable_for_type_inference(x.dtype) attrs = dict() - if dim: - attrs['dim'] = dim + attrs['dim'] = axis helper.append_op( type='cross', - inputs={'X': input, - 'Y': other}, + inputs={'X': x, + 'Y': y}, outputs={'Out': out}, attrs=attrs) return out diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index a31a85f98602a..979e2973064b0 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -39,6 +39,7 @@ from ..fluid.layers import scatter_nd #DEFINE_ALIAS from ..fluid.layers import shard_index #DEFINE_ALIAS from ..fluid.layers import unique_with_counts #DEFINE_ALIAS +from ..fluid import layers __all__ = [ 'cast', 'concat', 'expand', 'expand_as', 'flatten', 'gather', 'gather_nd', @@ -544,83 +545,81 @@ def _get_SectionsTensorList(one_list): return outs -def squeeze(input, axes, out=None, name=None): +def squeeze(x, axis=None, name=None): """ :alias_main: paddle.squeeze - :alias: paddle.squeeze,paddle.tensor.squeeze,paddle.tensor.manipulation.squeeze + :alias: paddle.squeeze, paddle.tensor.squeeze, paddle.tensor.manipulation.squeeze - This OP will squeeze single-dimensional entries of input tensor's shape. If axes is provided, will - remove the dims by axes, the dims selected by axes should be one. If not provide axes, all dims equal - to one will be deleted. + This OP will squeeze the dimension(s) of size 1 of input tensor x's shape. + If axis is provided, it will remove the dimension(s) by given axis that of size 1. + If the dimension of given axis is not of size 1, the dimension remain unchanged. 
+ If axis is not provided, all dims equal of size 1 will be removed. .. code-block:: text Case1: Input: - X.shape = (1, 3, 1, 5) - axes = [0] + x.shape = [1, 3, 1, 5] # If axis is not provided, all dims equal of size 1 will be removed. + axis = None Output: - Out.shape = (3, 1, 5) + out.shape = [3, 5] Case2: Input: - X.shape = (1, 3, 1, 5) - axes = [] + x.shape = [1, 3, 1, 5] # If axis is provided, it will remove the dimension(s) by given axis that of size 1. + axis = 0 + Output: + out.shape = [3, 1, 5] + + Case4: + + Input: + x.shape = [1, 3, 1, 5] # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged. + axis = [0, 2, 3] Output: - Out.shape = (3, 5) + out.shape = [3, 5] - Case3: + Case4: Input: - X.shape = [1,3,1,5] - axes = [-2] + x.shape = [1, 3, 1, 5] # If axis is negative, axis = axis + ndim (number of dimensions in x). + axis = [-2] Output: - Out.shape = [1,3,5] + out.shape = [1, 3, 5] Args: - input (Variable): The input Tensor. Support data type: float32, float64, int8, int32, int64. - axes (list): One integer or List of integers, indicating the dimensions to be squeezed. - Axes range is :math:`[-rank(input), rank(input))`. - If axes is negative, :math:`axes=axes+rank(input)`. + input (Tensor): The input Tensor. Support data type: float32, float64, int8, int32, int64. + axis (int|list|tuple, optional): An integer or list of integers, indicating the dimensions to be squeezed. Default is None. + The range of axis is :math:`[-ndim(input), ndim(input))`. + If axis is negative, :math:`axis = axis + ndim(input)`. + If axis is None, all the dimensions of input of size 1 will be removed. name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. Returns: - Variable: Output squeezed Tensor. Data type is same as input Tensor. + Tensor: Output squeezed Tensor. Data type is same as input Tensor. Examples: .. 
code-block:: python - import numpy as np import paddle - import paddle.fluid as fluid - with fluid.dygraph.guard(): - input_1 = np.random.random([5, 1, 10]).astype("int32") - # input is a variable which shape is [5, 1, 10] - input = fluid.dygraph.to_variable(input_1) - - output = paddle.squeeze(input, axes=[1]) - # output.shape [5, 10] + paddle.enable_imperative() + + x = paddle.rand([5, 1, 10]) + output = paddle.squeeze(x, axis=1) + # output.shape [5, 10] """ + if axis is None: + axis = [] + elif isinstance(axis, int): + axis = [axis] + elif isinstance(axis, tuple): + axis = list(axis) - helper = LayerHelper("squeeze", **locals()) - check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int8', 'int32', 'int64'], - 'squeeze') - check_type(axes, 'axes', list, 'squeeze') - out = helper.create_variable_for_type_inference(dtype=input.dtype) - x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type="squeeze2", - inputs={"X": input}, - attrs={"axes": axes}, - outputs={"Out": out, - "XShape": x_shape}) - - return out + return layers.squeeze(x, axis, name) def unsqueeze(input, axes, out=None, name=None): diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 500ab01f7c1fb..72cf76c5c725b 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -31,6 +31,8 @@ from ..fluid.layers import asin #DEFINE_ALIAS from ..fluid.layers import ceil #DEFINE_ALIAS from ..fluid.layers import cos #DEFINE_ALIAS +from ..fluid.layers import sinh #DEFINE_ALIAS +from ..fluid.layers import cosh #DEFINE_ALIAS from ..fluid.layers import cumsum #DEFINE_ALIAS from ..fluid.layers import elementwise_add #DEFINE_ALIAS from ..fluid.layers import elementwise_div #DEFINE_ALIAS @@ -69,6 +71,7 @@ 'atan', 'ceil', 'cos', + 'cosh', 'cumsum', 'elementwise_add', 'elementwise_div', @@ -76,7 +79,6 @@ 'elementwise_max', 'elementwise_min', 'elementwise_mod', - 'elementwise_mul', 'elementwise_pow', 'elementwise_sub', 
'exp', @@ -96,6 +98,7 @@ 'scale', 'sign', 'sin', + 'sinh', 'sqrt', 'square', 'stanh', @@ -107,6 +110,7 @@ 'min', 'mm', 'div', + 'multiply', 'add', 'atan', 'logsumexp', @@ -171,7 +175,7 @@ def func(x, name=None, out=None): .. code-block:: python import numpy as np - + import paddle import paddle.fluid as fluid @@ -201,9 +205,9 @@ def pow(input, exponent, out=None, name=None): Args: input(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32`` or ``float64``. exponent(float32|Variable): A scalar with type ``float32`` or a ``Tensor`` with shape [1] and type ``float32``. - out (Variable, optional): The Variable that stores results of the operation. + out (Variable, optional): The Variable that stores results of the operation. If out is None, a new Variable will be created to store the results. - name(str, optional): The default value is None. Normally there is no need for user to set this property. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -260,94 +264,6 @@ def pow(input, exponent, out=None, name=None): return out -def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, out=None, name=None): - """ - :alias_main: paddle.mul - :alias: paddle.mul,paddle.tensor.mul,paddle.tensor.math.mul - - Mul Operator. - This operator is used to perform matrix multiplication for input $x$ and $y$. - The equation is: - - .. math:: - Out = x * y - - Both the input $x$ and $y$ can carry the LoD (Level of Details) information, or not. - But the output only shares the LoD information with input $x$. - - Args: - x (Variable): The first input Tensor/LoDTensor of mul_op. - y (Variable): The second input Tensor/LoDTensor of mul_op. - x_num_col_dims (int, optional): The mul_op can take tensors with more than two dimensions as its inputs. 
- If the input $x$ is a tensor with more than two dimensions, $x$ will be flattened into a two-dimensional - matrix first. The flattening rule is: the first `num_col_dims` will be flattened to form the first - dimension of the final matrix (the height of the matrix), and the rest `rank(x) - num_col_dims` - dimensions are flattened to form the second dimension of the final matrix (the width of the matrix). - As a result, height of the flattened matrix is equal to the product of $x$'s first `x_num_col_dims` dimensions' - sizes, and width of the flattened matrix is equal to the product of $x$'s last `rank(x) - num_col_dims` - dimensions' size. For example, suppose $x$ is a 6-dimensional tensor with the shape [2, 3, 4, 5, 6], - and `x_num_col_dims` = 3. Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default is 1. - y_num_col_dims (int, optional): The mul_op can take tensors with more than two dimensions as its inputs. If the - input $y$ is a tensor with more than two dimensions, $y$ will be flattened into a two-dimensional matrix first. - The attribute `y_num_col_dims` determines how $y$ is flattened. See comments of `x_num_col_dims` for more details. - Default is 1. - out(Variable, optinal): The Variable that stores results of the operation. If out is None, - a new Variable will be created to store the results. - name (str, optional): Name of the output. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. Default is None. If both of out and name are not None, - the output name will be same as out. - - Returns: - Variable(Tensor/LoDTensor): The output Tensor/LoDTensor of mul op. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - dataX = fluid.data(name="dataX", shape=[2, 5], dtype="float32") - dataY = fluid.data(name="dataY", shape=[5, 3], dtype="float32") - - res = fluid.data(name="output", shape=[2, 3], dtype="float32") - output = paddle.mul(dataX, dataY, - x_num_col_dims = 1, - y_num_col_dims = 1, - out=res) - - - """ - inputs = {"X": [x], "Y": [y]} - attrs = {"x_num_col_dims": x_num_col_dims, "y_num_col_dims": y_num_col_dims} - if in_dygraph_mode(): - outs = core.ops.mul(inputs, attrs) - return outs['Out'][0] - - helper = LayerHelper("mul", **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mul') - check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64'], 'mul') - - if out is None: - out = helper.create_variable_for_type_inference(dtype=x.dtype) - else: - check_dtype( - out.dtype, out.name, - convert_dtype(x.dtype), 'mul', - '(The out data type in pow must be the same with input data type.)') - if name: - warnings.warn( - "The output Variable name of the paddle.tensor.pow operation can only be given by parameter out or name.\ - When parameter out and name are set at the same time, out has a higher priority than name. \ - Finally, the output Variable name is same as the out name %s" - % - out.name, - category=UserWarning, - stacklevel=2) - helper.append_op( - type="mul", inputs={"X": x, - "Y": y}, attrs=attrs, outputs={"Out": out}) - return out - - __ops__noattr__ = [ 'atan', 'sin', @@ -411,9 +327,6 @@ def _elementwise_op(helper): def add(x, y, alpha=1, out=None, name=None): """ - :alias_main: paddle.add - :alias: paddle.add,paddle.tensor.add,paddle.tensor.math.add - Examples: .. code-block:: python @@ -556,9 +469,6 @@ def gen_data(): def div(x, y, out=None, name=None): """ - :alias_main: paddle.div - :alias: paddle.div,paddle.tensor.div,paddle.tensor.math.div - Examples: .. 
code-block:: python @@ -674,13 +584,54 @@ def gen_data(): return _elementwise_op(LayerHelper(op_type, **locals())) +def multiply(x, y, axis=-1, name=None): + """ + :alias_main: paddle.multiply + :alias: paddle.multiply,paddle.tensor.multiply,paddle.tensor.math.multiply + +Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.enable_imperative() + x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) + y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + res = paddle.multiply(x, y) + print(res.numpy()) # [[5, 12], [21, 32]] + + x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) + y_data = np.array([1, 2], dtype=np.float32) + x = paddle.imperative.to_variable(x_data) + y = paddle.imperative.to_variable(y_data) + res = paddle.multiply(x, y, axis=1) + print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] + + """ + op_type = 'elementwise_mul' + act = None + if in_dygraph_mode(): + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) + + return _elementwise_op(LayerHelper(op_type, **locals())) + + for func in [ add, div, + multiply, ]: - proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div'} + proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div', 'multiply': 'elementwise_mul'} op_proto = OpProtoHolder.instance().get_op_proto(proto_dict[func.__name__]) if func.__name__ in ['add']: + alias_main = ':alias_main: paddle.%(func)s' % {'func': func.__name__} + alias = ':alias: paddle.%(func)s, paddle.tensor.%(func)s, paddle.tensor.math.%(func)s' % {'func': func.__name__} + additional_args_lines = [ "alpha (int|float, optional): The alpha factor of the input. Default is 1. If alpha is not 1, the equation becomes Out = X + alpha * Y.", "out (Variable, optinal): The Variable that stores results of the operation. Default is None. 
If out is None, \ @@ -692,15 +643,12 @@ def gen_data(): ] else: additional_args_lines = [ - "out (Variable, optinal): The Variable that stores results of the operation. If out is None, \ - a new Variable will be created to store the results." - , "name (string, optional): Name of the output. \ Default is None. It's used to print debug info for developers. Details: \ :ref:`api_guide_Name` " ] - func.__doc__ = _generate_doc_string_( + func.__doc__ = alias_main + """\n""" + alias + """\n""" + _generate_doc_string_( op_proto, additional_args_lines=additional_args_lines, skip_attrs_set={"x_data_format", "y_data_format", "axis", @@ -722,7 +670,7 @@ def sum(input, dim=None, dtype=None, keep_dim=False, name=None): Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. - dtype(str, optional): The dtype of output tensor. The default value is None, the dtype + dtype(str, optional): The dtype of output tensor. The default value is None, the dtype of output is the same as input tensor. keep_dim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension @@ -737,7 +685,7 @@ def sum(input, dim=None, dtype=None, keep_dim=False, name=None): Raises: ValueError, the :attr:`dtype` must be float64 or int64. - + Examples: .. code-block:: python @@ -818,7 +766,7 @@ def elementwise_sum(inputs, name=None): :alias: paddle.elementwise_sum,paddle.tensor.elementwise_sum,paddle.tensor.math.elementwise_sum ${comment} - + Case 1: :: Input: @@ -850,13 +798,13 @@ def elementwise_sum(inputs, name=None): [14, 16, 18]] Args: - inputs (Variable|list(Variable)): A Varaible list. The shape and data type of the list elementsshould be consistent. - Variable can be multi-dimensional Tensoror LoDTensor, and data types can be: float32, float64, int32, int64. + inputs (Variable|list(Variable)): A Varaible list. 
The shape and data type of the list elementsshould be consistent. + Variable can be multi-dimensional Tensoror LoDTensor, and data types can be: float32, float64, int32, int64. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: the sum of input :math:`inputs` . its shape and data types are consistent with :math:`inputs` . + Variable: the sum of input :math:`inputs` . its shape and data types are consistent with :math:`inputs` . Examples: .. code-block:: python @@ -882,8 +830,8 @@ def elementwise_sum(inputs, name=None): # the sum of input0 and input1 is 2-D Tensor with shape [2,3]. # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. """ @@ -928,7 +876,7 @@ def mm(input, mat2, out=None, name=None): Args: x (Variable): The input variable which is a Tensor or LoDTensor. mat2 (Variable): The input variable which is a Tensor or LoDTensor. - out(Variable, optional): Optional output which can be any created + out(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result. name(str, optional): The default value is None. 
Normally there is no need for @@ -1060,7 +1008,7 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace() exe = fluid.Executor(place) - results = exe.run(fluid.default_main_program(), + results = exe.run(fluid.default_main_program(), fetch_list=[out], feed={"input": data_input, 'x': data_x, "y": data_y}) print( np.array(results[0]) ) # [[10.5 10.5] @@ -1169,7 +1117,7 @@ def inverse(input, out=None, name=None): dimensions should be equal. When the number of dimensions is greater than 2, it is treated as batches of square matrix. The data type can be float32 and float64. - out (Variable, optional): Optional output which can be any created + out (Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. If out is None, a new Varibale will be create to store the result. name (str, optional): The default value is None. Normally there is no need for @@ -1192,7 +1140,7 @@ def inverse(input, out=None, name=None): # example for static graph input = fluid.data("input", shape=[2, 2], dtype="float32") out = paddle.inverse(input) - + place = fluid.CPUPlace() exe = fluid.Executor(place) results = exe.run(feed={"input": mat_np }, @@ -1249,10 +1197,10 @@ def max(input, dim=None, keep_dim=False, out=None, name=None): output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true, default value is False. - out(Variable, optional): Optional output which can be any created + out(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result. - name(str, optional): The default value is None. Normally there is no need for + name(str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name` Returns: @@ -1329,10 +1277,10 @@ def min(input, dim=None, keep_dim=False, out=None, name=None): output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true, default value is False. - out(Variable, optional): Optional output which can be any created + out(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result. - name(str, optional): The default value is None. Normally there is no need for + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: @@ -1389,7 +1337,7 @@ def min(input, dim=None, keep_dim=False, out=None, name=None): return out -def log1p(x, out=None, name=None): +def log1p(x, name=None): """ :alias_main: paddle.log1p :alias: paddle.log1p,paddle.tensor.log1p,paddle.tensor.math.log1p @@ -1399,9 +1347,6 @@ def log1p(x, out=None, name=None): Out = \\ln(x+1) Args: x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64. - out(Variable, optional): Optional output which can be any created - Variable that meets the requirements to store the result of operation. - if out is None, a new Varibale will be create to store the result. name(str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name` Returns: @@ -1430,11 +1375,11 @@ def log1p(x, out=None, name=None): inputs = {'X': [x]} helper = LayerHelper('log1p', **locals()) dtype = helper.input_dtype(input_param_name='x') - if out is None: - out = helper.create_variable_for_type_inference(dtype) + out = helper.create_variable_for_type_inference(dtype) helper.append_op(type="log1p", inputs={"X": x}, outputs={"Out": out}) return out + def addcmul(input, tensor1, tensor2, value=1.0, out=None, name=None): """ :alias_main: paddle.addcmul @@ -1496,17 +1441,17 @@ def clamp(input, min=None, max=None, output=None, name=None): .. math:: - Out = MIN(MAX(x, min), max) + Out = MIN(MAX(x, min), max) Args: - input (Variable): An input N-D Tensor or LoDTensor - with data type float32, float64. + input (Variable): An input N-D Tensor or LoDTensor + with data type float32, float64. min (float32|Variable): The lower bound with type ``float32`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. max (float32|Variable): The upper bound with type ``float32`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. - output (Variable, optional): A tensor or LoDTensor. If :attr:`output` is None, - a new tensor will be created as :attr:`output`. Default: None. + output (Variable, optional): A tensor or LoDTensor. If :attr:`output` is None, + a new tensor will be created as :attr:`output`. Default: None. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -1578,11 +1523,11 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): :alias: paddle.trace,paddle.tensor.trace,paddle.tensor.math.trace This OP computes the sum along diagonals of the input tensor x. - - If ``x`` is 2D, returns the sum of diagonal. + + If ``x`` is 2D, returns the sum of diagonal. 
If ``x`` has larger dimensions, then returns an tensor of diagonals sum, diagonals be taken from - the 2D planes specified by axis1 and axis2. By default, the 2D planes formed by the first and second axes + the 2D planes specified by axis1 and axis2. By default, the 2D planes formed by the first and second axes of the input tensor x. The argument ``offset`` determines where diagonals are taken from input tensor x: @@ -1590,7 +1535,7 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): - If offset = 0, it is the main diagonal. - If offset > 0, it is above the main diagonal. - If offset < 0, it is below the main diagonal. - + Args: x(Variable): The input tensor x. Must be at least 2-dimensional. The input data type should be float32, float64, int32, int64. offset(int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals). @@ -1606,11 +1551,11 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): import paddle import numpy as np - + case1 = np.random.randn(2, 3).astype('float32') case2 = np.random.randn(3, 10, 10).astype('float32') case3 = np.random.randn(3, 10, 5, 10).astype('float32') - + paddle.enable_imperative() case1 = paddle.imperative.to_variable(case1) @@ -1674,17 +1619,17 @@ def kron(x, y, out=None, name=None): ${comment} Args: - x (Variable): the fist operand of kron op, data type: float16, float32, + x (Variable): the fist operand of kron op, data type: float16, float32, float64, int32 or int64. - y (Variable): the second operand of kron op, data type: float16, - float32, float64, int32 or int64. Its data type should be the same + y (Variable): the second operand of kron op, data type: float16, + float32, float64, int32 or int64. Its data type should be the same with x. - out (Variable, optional): Optional output which can be any created - Variable that meets the requirements to store the result of - operation. 
If out is None, a new Varibale will be create to store + out (Variable, optional): Optional output which can be any created + Variable that meets the requirements to store the result of + operation. If out is None, a new Varibale will be create to store the result. Defaults to None. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1692,7 +1637,7 @@ def kron(x, y, out=None, name=None): Examples: .. code-block:: python - + import paddle from paddle import fluid import paddle.fluid.dygraph as dg diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 8eabaa84ce3d3..eac99163e0579 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -37,172 +37,115 @@ ] -def randint(low, - high=None, - shape=None, - out=None, - dtype=None, - device=None, - stop_gradient=False, - seed=0, - name=None): +def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ :alias_main: paddle.randint :alias: paddle.randint,paddle.tensor.randint,paddle.tensor.random.randint - This function returns a Tensor filled with random integers from the "discrete uniform" distribution of the - specified data type in the interval [low, high). If high is None (the default), then results are from [0, low). + This function returns a Tensor filled with random integers from the + "discrete uniform" distribution of the specified data type in the interval + [low, high). If high is None (the default), then results are from [0, low). Args: - low (int): The lower bound on the range of random values to generate, the low is included in the range. - (unless high=None, in which case this parameter is one above the highest such integer). 
- high (int, optional): The upper bound on the range of random values to generate, the high is excluded - in the range. Default None(see above for behavior if high=None). - shape (list|tuple|Variable, optional): The shape of the output Tensor, if the shape is a list or tuple, - its elements can be an integer - or a Tensor with the shape [1], and the type of the Tensor must be int32 or int64. - If the shape is a Variable, it is a 1-D Tensor, and the type of the Tensor must be - int32 or int64. Default is None, in which case the shape is [1]. - out(Variable, optional): Optional output which can be any created - Variable that meets the requirements to store the result of operation. - if out is None, a new Varibale will be create to store the result. - dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output Tensor - which can be int32, int64, if dytpe is `None`, the data - type of created Tensor is `int64` - device(str, optional): This parameter specifies that the Tensor is created - on the GPU or CPU. - stop_gradient(bool, optional): Indicating if we stop gradient from current(out) Variable, - default value is False. - seed (int, optional): Random seed used for permute samples. If seed is - equal to 0, it means use a seed generated by the system. Note that - if seed is not 0, this operator will always generate the same random - permutation every time. Default: 0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + low (int): The lower bound on the range of random values to generate, + the low is included in the range.(unless high=None, in which case + this parameter is one above the highest such integer). Default is 0. + high (int, optional): The upper bound on the range of random values to + generate, the high is excluded in the range. Default is None(see + above for behavior if high=None). 
+ shape (list|tuple|Variable, optional): The shape of the output Tensor, + if the shape is a list or tuple, its elements can be an integer or + a Tensor with the shape [1], and the type of the Tensor must be + int32 or int64. If the shape is a Variable, it is a 1-D Tensor, + and the type of the Tensor must be int32 or int64. Default is None. + dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the + output Tensor which can be int32, int64. If dtype is `None`, the + data type of created Tensor is `int64` + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. Returns: Variable: A Tensor of the specified shape filled with random integers. Raises: - TypeError: Randint's low must less then high. + TypeError: If shape's type is not list, tuple or Variable. + TypeError: If dtype is not int32 or int64. + ValueError: If low is not large then high; If low is 0, and high is None. Examples: .. code-block:: python - import paddle - import paddle.fluid as fluid - - # example 1: - # attr shape is a list which doesn't contain tensor Variable. - result_1 = paddle.randint(low=-5, high=5, shape=[3, 4], dtype="int64") - - # example 2: - # attr shape is a list which contains tensor Variable. - dim_1 = fluid.layers.fill_constant([1],"int64",3) - dim_2 = fluid.layers.fill_constant([1],"int32",5) - result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32") - - # example 3: - # attr shape is a Variable, the data type must be int64 or int32. 
- var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64") - result_3 = paddle.randint(low=-5, high=5, shape=var_shape, dtype="int32") - var_shape_int32 = fluid.data(name='var_shape_int32', shape=[2], dtype="int32") - result_4 = paddle.randint(low=-5, high=5, shape=var_shape_int32, dtype="int64") - - # example 4: - # Input only one parameter - # low=0, high=10, shape=[1], dtype='int64' - result_4 = paddle.randint(10) - """ - - def get_new_shape_tensor(list_shape): - new_shape_tensor = [] - for dim in list_shape: - if isinstance(dim, Variable): - dim.stop_gradient = True - new_shape_tensor.append(dim) - else: - assert isinstance(dim, int) or isinstance(dim, long) - temp_out = helper.create_variable_for_type_inference('int64') - fill_constant([1], 'int64', dim, force_cpu=True, out=temp_out) - new_shape_tensor.append(temp_out) - return new_shape_tensor - - def get_attr_shape(list_shape): - unk_dim_idx = -1 - attrs_shape = [] - for dim_idx, dim_size in enumerate(list_shape): - if isinstance(dim_size, Variable): - attrs_shape.append(-1) - else: - attrs_shape.append(dim_size) - assert dim_size > 0, ( - "Each dimension size given in shape must not be negative " - "except one unknown dimension.") - return attrs_shape - - if dtype is None: - dtype = 'int64' - check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint') - - inputs = dict() - attrs = dict() - - if shape is None: - shape = [1] - assert len(shape) > 0, ("The size of argument(shape) can't be zero.") + import paddle + import numpy as np - helper = LayerHelper("randint", **locals()) + paddle.enable_imperative() - if in_dygraph_mode(): - attrs['shape'] = shape - else: - if isinstance(shape, Variable): - shape.stop_gradient = True - inputs["ShapeTensor"] = shape - elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ( - "The size of argument(shape) can't be zero.") - if utils._contain_var(shape): - inputs['ShapeTensorList'] = get_new_shape_tensor(shape) - else: - attrs["shape"] = 
get_attr_shape(shape) - check_type(shape, 'shape', (list, tuple, Variable), 'randint') + # example 1: + # attr shape is a list which doesn't contain tensor Variable. + result_1 = paddle.randint(low=-5, high=5, shape=[3]) + # [0 -3 2] + + # example 2: + # attr shape is a list which contains tensor Variable. + dim_1 = paddle.fill_constant([1],"int64",2) + dim_2 = paddle.fill_constant([1],"int32",3) + result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32") + print(result_2.numpy()) + # [[ 0 -1 -3] + # [ 4 -2 0]] + + # example 3: + # attr shape is a Variable + var_shape = paddle.imperative.to_variable(np.array([3])) + result_3 = paddle.randint(low=-5, high=5, shape=var_shape) + # [-2 2 3] + + # example 4: + # data type is int32 + result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') + # [-5 4 -4] + + # example 5: + # Input only one parameter + # low=0, high=10, shape=[1], dtype='int64' + result_5 = paddle.randint(10) + # [7] + """ if high is None: + if low <= 0: + raise ValueError( + "If high is None, low must be greater than 0, but received low = {0}.". 
+ format(low)) high = low low = 0 - attrs['low'] = low - attrs['high'] = high - attrs['seed'] = seed - if (low >= high): + if dtype is None: + dtype = 'int64' + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + if in_dygraph_mode(): + shape = utils._convert_shape_to_list(shape) + return core.ops.randint('shape', shape, 'low', low, 'high', high, + 'seed', 0, 'dtype', dtype) + + check_type(shape, 'shape', (list, tuple, Variable), 'randint') + check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint') + if low >= high: raise ValueError( "randint's low must less then high, but received low = {0}, " "high = {1}".format(low, high)) - if out is None: - if name is None: - out = helper.create_variable_for_type_inference(dtype=dtype) - else: - out = helper.create_variable( - name=name, dtype=dtype, persistable=False) - else: - check_dtype(dtype, 'dtype', - convert_dtype(out.dtype), 'randint', - "(The dtype in randint must be the same with out's dtype.)") - attrs['dtype'] = out.dtype - out.stop_gradient = stop_gradient - - if device is None: - helper.append_op( - type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs) - else: - with device_guard(device): - helper.append_op( - type='randint', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + inputs = dict() + attrs = {'low': low, 'high': high, 'seed': 0, 'dtype': dtype} + utils._get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='randint') + + helper = LayerHelper("randint", **locals()) + out = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs) return out diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 59b8f1e765b26..d8874e47c3579 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -63,9 +63,9 @@ def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None): Variable 
that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result. Defalut is None. keepdims(bool, optional): Keep the axis that do the select max. - name(str, optional): The name of output variable, normally there is no need for user to set this this property. - Default value is None, the framework set the name of output variable. - + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. Returns: Variable: A Tensor with data type int64. @@ -135,7 +135,7 @@ def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None): return out -def index_select(input, index, dim=0): +def index_select(x, index, axis=0, name=None): """ :alias_main: paddle.index_select :alias: paddle.index_select,paddle.tensor.index_select,paddle.tensor.search.index_select @@ -146,56 +146,60 @@ def index_select(input, index, dim=0): size as the length of `index`; other dimensions have the same size as in the `input` tensor. Args: - input (Variable): The input tensor variable. - index (Variable): The 1-D tensor containing the indices to index. - dim (int): The dimension in which we index. + x (Variable): The input tensor variable.The dtype of x can be one of float32, float64, int32, int64. + index (Variable): The 1-D tensor containing the indices to index.the dtype of index can be int32 or int64. + axis (int, optional): The dimension in which we index. Default: if None, the axis is 0. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. Returns: Variable: A Tensor with same data type as `input`. + + Raises: + TypeError: x must be a Variable and the dtype of x must be one of float32, float64, int32 and int64. + TypeError: index must be a Variable adn the dtype of index must be int32 or int64. 
Examples: .. code-block:: python import paddle - import paddle.fluid as fluid import numpy as np + paddle.enable_imperative() # Now we are in imperative mode data = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]]) data_index = np.array([0, 1, 1]).astype('int32') - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data) - index = fluid.dygraph.to_variable(data_index) - out_z1 = paddle.index_select(x, index) - print(out_z1.numpy()) - #[[1. 2. 3. 4.] - # [5. 6. 7. 8.] - # [5. 6. 7. 8.]] - out_z2 = paddle.index_select(x, index, dim=1) - print(out_z2.numpy()) - #[[ 1. 2. 2.] - # [ 5. 6. 6.] - # [ 9. 10. 10.]] + x = paddle.imperative.to_variable(data) + index = paddle.imperative.to_variable(data_index) + out_z1 = paddle.index_select(x=x, index=index) + #[[1. 2. 3. 4.] + # [5. 6. 7. 8.] + # [5. 6. 7. 8.]] + out_z2 = paddle.index_select(x=x, index=index, axis=1) + #[[ 1. 2. 2.] + # [ 5. 6. 6.] + # [ 9. 10. 10.]] """ - helper = LayerHelper("index_select", **locals()) + if in_dygraph_mode(): - return core.ops.index_select(input, index, 'dim', dim) + return core.ops.index_select(x, index, 'dim', axis) - check_variable_and_dtype(input, 'x', - ['float32', 'float64', 'int32', 'int64'], - 'paddle.tensor.search.index_sample') + helper = LayerHelper("index_select", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], + 'paddle.tensor.search.index_select') check_variable_and_dtype(index, 'index', ['int32', 'int64'], - 'paddle.tensor.search.index_sample') + 'paddle.tensor.search.index_select') - out = helper.create_variable_for_type_inference(input.dtype) + out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type='index_select', - inputs={'X': input, + inputs={'X': x, 'Index': index}, outputs={'Out': out}, - attrs={'dim': dim}) + attrs={'dim': axis}) return out diff --git a/python/setup.py.in b/python/setup.py.in index 31a089d727276..10325e096fb76 100644 --- a/python/setup.py.in +++ 
b/python/setup.py.in @@ -145,10 +145,10 @@ packages=['paddle', 'paddle.incubate.complex.tensor', 'paddle.fleet', 'paddle.fleet.base', - 'paddle.fleet.collective', + 'paddle.fleet.meta_optimizers', + 'paddle.fleet.runtime', 'paddle.fleet.dataset', 'paddle.fleet.metrics', - 'paddle.fleet.parameter_server', 'paddle.fleet.proto', 'paddle.framework', 'paddle.fluid', @@ -272,6 +272,10 @@ else: shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path) package_data['paddle.libs'] += ['openblas' + ext_name] +if '${WITH_LITE}' == 'ON': + shutil.copy('${LITE_SHARED_LIB}', libs_path) + package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name] + if '${WITH_PSLIB}' == 'ON': shutil.copy('${PSLIB_LIB}', libs_path) if os.path.exists('${PSLIB_VERSION_PY}'): @@ -320,6 +324,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' else: command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' + # The dynamic library compiled under aarch64 is greater than 64M, + # and an oversize error will be reported when using patchelf. 
if platform.machine() != 'aarch64': if os.system(command) != 0: raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 3bdcc4cad1ce3..16b03936e9390 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -19,6 +19,7 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" + "python/paddle/fleet/__init__.py" "python/requirements.txt" "python/paddle/fluid/__init__.py" "python/paddle/fluid/compiler.py" @@ -80,10 +81,18 @@ if [ "$api_doc_spec_diff" != "" ]; then check_approval 1 2870059 29231 27208573 28379894 11935832 fi +api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` +if [ "$api_spec_diff" != "" ]; then + echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n" + echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n" + echo_line="${echo_line}Related APIs: ${api_spec_diff}" + check_approval 1 6888866 43953930 +fi + op_type_spec_diff=`python ${PADDLE_ROOT}/tools/check_op_register_type.py ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_DEV.spec ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_PR.spec` if [ "$op_type_spec_diff" != "" ]; then echo_line="You must have one RD (Aurelius84 (Recommend) or liym27 or zhhsplendid)approval for the data_type registration of new operator. More data_type of new operator should be registered in your PR. 
Please make sure that both float/double (or int/int64_t) have been registered.\n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/Data-types-of-generic-Op-must-be-fully-registered].\n"
-    check_approval 1 9301846 33742067 7913861
+    check_approval 1 9301846 33742067 7913861
 fi
 
 op_desc_diff=`python ${PADDLE_ROOT}/tools/check_op_desc.py ${PADDLE_ROOT}/paddle/fluid/OP_DESC_DEV.spec ${PADDLE_ROOT}/paddle/fluid/OP_DESC_PR.spec`
@@ -97,7 +106,7 @@ for API_FILE in ${API_FILES[*]}; do
   if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
       # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
       # You can use http://caius.github.io/github_id/ to find Github user id.
-      # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059,Boyan-Liu 31623103, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, JepsonWong 16509038, Shixiaowei02 39303645, Heeenrrry 28379894.
+ # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059,Boyan-Liu 31623103, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, JepsonWong 16509038, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676. Dong Daxiang 35550832. if [ "${API_FILE}" == "CMakeLists.txt" ];then echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n" check_approval 1 6836917 46782768 @@ -143,6 +152,9 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "tools/wlist.json" ];then echo_line="You must have one RD (lelelelelez (Recommend) or luotao1) approval for the api whitelist for the tools/wlist.json.\n" check_approval 1 22937122 6836917 + elif [ "${API_FILE}" == "python/paddle/fleet/__init__.py" ]; then + echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes " + check_approval 1 35550832 38231817 else echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for ${API_FILE}, which manages the underlying code for fluid.\n" check_approval 1 3048612 46782768 12538138 6836917 32832641 @@ -281,6 +293,21 @@ if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then fi fi +RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true` +if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED}; + do + RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH 
${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|PROPERTIES[[:space:]]+TIMEOUT" || true` + if [[ ${RUNTYPE_ADD} != "" ]];then + RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n" + fi + done + if [[ ${RUNTYPE_ADD_LINES} != "" ]];then + echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE to EXCLUSIVE or DIST, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification" + check_approval 1 32428676 45041955 + fi +fi + DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_PR.spec ADDED_OP_USE_DEFAULT_GRAD_MAKER=`python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC} ${PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC}` diff --git a/tools/check_api_source_without_core_ops.py b/tools/check_api_source_without_core_ops.py new file mode 100644 index 0000000000000..d04cbcd160b91 --- /dev/null +++ b/tools/check_api_source_without_core_ops.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import difflib +import sys +import importlib +import os +import count_api_without_core_ops + +with open(sys.argv[1], 'r') as f: + origin = f.read() + origin = origin.splitlines() + +with open(sys.argv[2], 'r') as f: + new = f.read() + new = new.splitlines() + +differ = difflib.Differ() +result = differ.compare(origin, new) + +api_with_ops, api_without_ops = count_api_without_core_ops.get_apis_with_and_without_core_ops( + ['paddle']) + +error = False +# get all diff apis +# check if the changed api's source code contains append_op but not core.ops +diffs = [] +for each_diff in result: + if each_diff[0] == '+': + api_name = each_diff.split(' ')[1].strip() + if api_name in api_without_ops and api_name.find('sequence') == -1: + error = True + diffs += [api_name] + +if error: + for each_diff in diffs: + print(each_diff) diff --git a/tools/count_api_without_core_ops.py b/tools/count_api_without_core_ops.py new file mode 100644 index 0000000000000..99e84074158ad --- /dev/null +++ b/tools/count_api_without_core_ops.py @@ -0,0 +1,187 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import importlib +import inspect +import collections +import sys +import pydoc +import hashlib +import six +import functools + +__all__ = ['get_apis_with_and_without_core_ops', ] + +# APIs that should not be printed into API.spec +omitted_list = [ + "paddle.fluid.LoDTensor.set", # Do not know why it should be omitted + "paddle.fluid.io.ComposeNotAligned", + "paddle.fluid.io.ComposeNotAligned.__init__", +] + + +def md5(doc): + hash = hashlib.md5() + hash.update(str(doc).encode('utf-8')) + return hash.hexdigest() + + +def split_with_and_without_core_ops(member, cur_name): + if cur_name in omitted_list: + return + + if inspect.isclass(member): + pass + else: + try: + source = inspect.getsource(member) + if source.find('append_op') != -1: + if source.find('core.ops') != -1: + api_with_ops.append(cur_name) + else: + api_without_ops.append(cur_name) + except: + # If getsource failed (pybind API or function inherit from father class), just skip + pass + + +def get_md5_of_func(member, cur_name): + if cur_name in omitted_list: + return + + doc_md5 = md5(member.__doc__) + + if inspect.isclass(member): + pass + else: + try: + source = inspect.getsource(member) + func_dict[cur_name] = md5(source) + except: + # If getsource failed (pybind API or function inherit from father class), just skip + pass + + +def visit_member(parent_name, member, func): + cur_name = ".".join([parent_name, member.__name__]) + if inspect.isclass(member): + func(member, cur_name) + for name, value in inspect.getmembers(member): + if hasattr(value, '__name__') and (not name.startswith("_") or + name == "__init__"): + visit_member(cur_name, value, func) + elif inspect.ismethoddescriptor(member): + return + elif callable(member): + func(member, cur_name) + elif inspect.isgetsetdescriptor(member): + return + else: + raise RuntimeError("Unsupported generate signature of member, type {0}". 
+ format(str(type(member)))) + + +def is_primitive(instance): + int_types = (int, long) if six.PY2 else (int, ) + pritimitive_types = int_types + (float, str) + if isinstance(instance, pritimitive_types): + return True + elif isinstance(instance, (list, tuple, set)): + for obj in instance: + if not is_primitive(obj): + return False + + return True + else: + return False + + +def visit_all_module(mod, visited, func): + mod_name = mod.__name__ + if mod_name != 'paddle' and not mod_name.startswith('paddle.'): + return + + if mod_name.startswith('paddle.fluid.core'): + return + + if mod in visited: + return + + visited.add(mod) + + for member_name in ( + name + for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) + if not name.startswith("_")): + instance = getattr(mod, member_name, None) + if instance is None: + continue + + if is_primitive(instance): + continue + + if not hasattr(instance, "__name__"): + continue + + if inspect.ismodule(instance): + visit_all_module(instance, visited, func) + else: + visit_member(mod.__name__, instance, func) + + +def get_apis_with_and_without_core_ops(modules): + global api_with_ops, api_without_ops + api_with_ops = [] + api_without_ops = [] + for m in modules: + visit_all_module( + importlib.import_module(m), set(), split_with_and_without_core_ops) + return api_with_ops, api_without_ops + + +def get_api_source_desc(modules): + global func_dict + func_dict = collections.OrderedDict() + for m in modules: + visit_all_module(importlib.import_module(m), set(), get_md5_of_func) + return func_dict + + +if __name__ == "__main__": + if len(sys.argv) > 1: + modules = sys.argv[2].split(",") + if sys.argv[1] == '-c': + api_with_ops, api_without_ops = get_apis_with_and_without_core_ops( + modules) + + print('api_with_ops:', len(api_with_ops)) + print('\n'.join(api_with_ops)) + print('\n==============\n') + print('api_without_ops:', len(api_without_ops)) + print('\n'.join(api_without_ops)) + + if sys.argv[1] == '-p': + func_dict = 
get_api_source_desc(modules) + for name in func_dict: + print(name, func_dict[name]) + + else: + print("""Usage: + 1. Count and list all operator-raleated APIs that contains append_op but not core.ops.xx. + python ./count_api_without_core_ops.py -c paddle + 2. Print api and the md5 of source code of the api. + python ./count_api_without_core_ops.py -p paddle + """) diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos new file mode 100644 index 0000000000000..bb620e6822aff --- /dev/null +++ b/tools/dockerfile/Dockerfile.centos @@ -0,0 +1,81 @@ +# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in +# order to satisfy the build of capnproto library (a nupic.core dependency), +# which requires some headers and symbols not present on CentOS-5 (e.g., +# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See +# https://github.com/sandstorm-io/capnproto/issues/350. +FROM nvidia/cuda: +MAINTAINER Numenta, based on the ManyLinux project + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} +ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig + +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel +COPY build_scripts /build_scripts +RUN bash build_scripts/build.sh +RUN bash build_scripts/install_nccl2.sh && \ + bash build_scripts/install_trt.sh +RUN rm -rf build_scripts + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +# for paddle +RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src + + +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH} + +# 
protobuf 3.6.1 +RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz && \ + tar xzf protobuf-cpp-3.6.1.tar.gz && \ + cd protobuf-3.6.1 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.6.1.tar.gz + +RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt + +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install setuptools -U + +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ + 
LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install -r /root/requirements.txt && \ + go get github.com/Masterminds/glide && \ + rm -rf /root/requirements.txt + +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python + +RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ + cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz + +# ccache 3.7.9 +RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ + tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ + ./configure -prefix=/usr/local/ccache-3.7.9 && \ + make -j8 && make install && \ + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + +# gcc4.8 TRT +RUN mkdir -p /opt/compiler && cd /opt/compiler && \ + wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \ + tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz + +CMD ["bash", 
"/paddle/paddle/scripts/docker/build.sh"] diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu new file mode 100644 index 0000000000000..f424d676f70b1 --- /dev/null +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -0,0 +1,222 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM nvidia/cuda:<baseimg> +MAINTAINER PaddlePaddle Authors + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} + +ENV HOME /root +# Add bash enhancements +COPY paddle/scripts/docker/root/ /root/ + +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +RUN apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + patchelf git python-pip python-dev python-opencv openssh-server bison \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ + curl sed grep graphviz libjpeg-dev zlib1g-dev \ + python-matplotlib \ + automake locales clang-format swig \ + liblapack-dev liblapacke-dev \ + net-tools libtool module-init-tools && \ + apt-get clean -y + +# Downgrade gcc&&g++ + + +# install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q 
https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig + +# Install Python3.8 +RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \ + tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig + +# Install Python3.5 +RUN wget -q https://www.python.org/ftp/python/3.5.1/Python-3.5.1.tgz && \ + tar -xzf Python-3.5.1.tgz && cd Python-3.5.1 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.5.1 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig +ENV PATH=/usr/local/python3.5.1/include:${PATH} +ENV PATH=/usr/local/python3.5.1/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/python3.5.1/lib:${LD_LIBRARY_PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/python3.5.1/include/python3.5:$CPLUS_INCLUDE_PATH +RUN ln -sf /usr/local/python3.5.1/bin/python3.5 /usr/local/bin/python3 && ln -sf /usr/local/python3.5.1/bin/python3.5 /usr/bin/python3 + +RUN rm -r /root/python_build + +# Install Python2.7.15 to replace original python +WORKDIR /home +ENV version=2.7.15 +RUN wget https://www.python.org/ftp/python/$version/Python-$version.tgz && tar -xvf Python-$version.tgz +WORKDIR /home/Python-$version +RUN ./configure --enable-unicode=ucs4 --enable-shared CFLAGS=-fPIC --prefix=/usr/local/python2.7.15 && make && make install + +RUN echo "export 
PATH=/usr/local/python2.7.15/include:${PATH}" >> ~/.bashrc && echo "export PATH=/usr/local/python2.7.15/bin:${PATH}" >> ~/.bashrc && echo "export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH}" >> ~/.bashrc && echo "export CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH" >> ~/.bashrc +ENV PATH=/usr/local/python2.7.15/include:${PATH} +ENV PATH=/usr/local/python2.7.15/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH +RUN mv /usr/bin/python /usr/bin/python.bak && ln -s /usr/local/python2.7.15/bin/python2.7 /usr/local/bin/python && ln -s /usr/local/python2.7.15/bin/python2.7 /usr/bin/python + +WORKDIR /home +RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip && apt-get -y install unzip && unzip setuptools-40.6.2.zip +WORKDIR /home/setuptools-40.6.2 +RUN python setup.py build && python setup.py install +WORKDIR /home +RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz && tar -zxvf pip-18.0.tar.gz +WORKDIR pip-18.0 +RUN python setup.py install && \ + python3.8 setup.py install && \ + python3.7 setup.py install && \ + python3.6 setup.py install && \ + python3 setup.py install + +WORKDIR /home +RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \ + rm -r Python-$version setuptools-40.6.2 pip-18.0 + +# Install Go and glide +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. + +# Downgrade TensorRT +COPY tools/dockerfile/build_scripts /build_scripts +RUN bash /build_scripts/install_trt.sh +RUN rm -rf /build_scripts + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. 
+RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.8 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.8 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.8 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.8 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python + 
+#For docstring checker +RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ + pip3.6 --no-cache-dir install pylint pytest astroid isort && \ + pip3.7 --no-cache-dir install pylint pytest astroid isort && \ + pip3.8 --no-cache-dir install pylint pytest astroid isort && \ + pip --no-cache-dir install pylint pytest astroid isort LinkChecker + +RUN pip3 --no-cache-dir install coverage && \ + pip3.6 --no-cache-dir install coverage && \ + pip3.7 --no-cache-dir install coverage && \ + pip3.8 --no-cache-dir install coverage && \ + pip --no-cache-dir install coverage + +COPY ./python/requirements.txt /root/ +RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ + pip3.6 --no-cache-dir install -r /root/requirements.txt && \ + pip3.7 --no-cache-dir install -r /root/requirements.txt && \ + pip3.8 --no-cache-dir install -r /root/requirements.txt && \ + pip --no-cache-dir install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y && \ + pip3 --no-cache-dir install certifi urllib3[secure] && \ + pip3.6 --no-cache-dir install certifi urllib3[secure] && \ + pip3.7 --no-cache-dir install certifi urllib3[secure] && \ + pip3.8 --no-cache-dir install certifi urllib3[secure] && \ + pip --no-cache-dir install certifi urllib3[secure] + +# ar mishandles 4GB files +# https://sourceware.org/bugzilla/show_bug.cgi?id=14625 +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \ + tar -xzf binutils_2.27.orig.tar.gz && \ + cd binutils-2.27 && \ + ./configure && make -j && make install && cd .. 
&& rm -rf binutils-2.27 binutils_2.27.orig.tar.gz + +RUN pip --no-cache-dir install -U netifaces==0.10.9 + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. +RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +CMD source ~/.bashrc + +EXPOSE 22 diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh new file mode 100644 index 0000000000000..c42e9f25fe519 --- /dev/null +++ b/tools/dockerfile/build_scripts/build.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# Top-level build script called from Dockerfile + +# Stop at any error, show all commands +set -ex + +# Python versions to be installed in /opt/$VERSION_NO +# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so +# remove others to expedite build and reduce docker image size. The original +# manylinux docker image project builds many python versions. 
+# NOTE We added back 3.5.1, since auditwheel requires python 3.3+ +CPYTHON_VERSIONS="3.8.0 3.7.0 3.6.0 3.5.1 2.7.15" + +# openssl version to build, with expected sha256 hash of .tar.gz +# archive +OPENSSL_ROOT=openssl-1.1.0i +OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99 +EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d +DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc +PATCHELF_HASH=f2aa40a6148cb3b0ca807a1bf836b081793e55ec9e5540a5356d800132be7e0a +CURL_ROOT=curl-7.49.1 +CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1 +AUTOCONF_ROOT=autoconf-2.69 +AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 + +# Dependencies for compiling Python that we want to remove from +# the final image after compiling Python +PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel" + +# Libraries that are allowed as part of the manylinux1 profile +MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel" + +# Get build utilities +MY_DIR=$(dirname "${BASH_SOURCE[0]}") +source $MY_DIR/build_utils.sh + +# EPEL support +yum -y install wget curl +curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm +check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH + +# Dev toolset (for LLVM and other projects requiring C++11 support) +curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo +check_sha256sum devtools-2.repo $DEVTOOLS_HASH +mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo +rpm -Uvh --replacepkgs epel-release-6*.rpm +rm -f epel-release-6*.rpm + +# Development tools and libraries +yum -y install bzip2 make git patch unzip bison yasm diffutils \ + automake which file \ + 
kernel-devel-`uname -r` \ + devtoolset-2-binutils devtoolset-2-gcc \ + devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \ + ${PYTHON_COMPILE_DEPS} + +# Install more recent version of cmake +# curl -O https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.sh +# /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license +# rm cmake-3.8.1-Linux-x86_64.sh + +wget -q https://cmake.org/files/v3.16/cmake-3.16.0.tar.gz && tar xzf cmake-3.16.0.tar.gz && \ +cd cmake-3.16.0 && ./bootstrap && \ +make -j8 && make install && cd .. && rm cmake-3.16.0.tar.gz + +# Install newest autoconf +build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH +autoconf --version + +# Compile the latest Python releases. +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +build_openssl $OPENSSL_ROOT $OPENSSL_HASH +mkdir -p /opt/python +build_cpythons $CPYTHON_VERSIONS + +PY35_BIN=/opt/python/cp35-cp35m/bin +PY36_BIN=/opt/python/cp36-cp36m/bin +PY37_BIN=/opt/python/cp37-cp37m/bin +PY38_BIN=/opt/python/cp38-cp38m/bin +# NOTE Since our custom manylinux image builds pythons with shared +# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running +# python. 
+ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib" + +# Our openssl doesn't know how to find the system CA trust store +# (https://github.com/pypa/manylinux/issues/53) +# And it's not clear how up-to-date that is anyway +# So let's just use the same one pip and everyone uses +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi +ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \ + /opt/_internal/certs.pem +# If you modify this line you also have to modify the versions in the +# Dockerfiles: +export SSL_CERT_FILE=/opt/_internal/certs.pem + +# Install newest curl +build_curl $CURL_ROOT $CURL_HASH +rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc +hash -r +curl --version +curl-config --features + +# Now we can delete our built SSL +rm -rf /usr/local/ssl + +# Install patchelf (latest with unreleased bug fixes) +# FIXME(typhoonzero): restore this when the link is fixed. 
+# curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz +# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH +# tar -xzf patchelf-0.9njs2.tar.gz +# (cd patchelf-0.9njs2 && ./configure && make && make install) +# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2 +yum install -y patchelf + +# Install latest pypi release of auditwheel +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel +ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel + +# Clean up development headers and other unnecessary stuff for +# final image +yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ + avahi freetype bitstream-vera-fonts \ + ${PYTHON_COMPILE_DEPS} > /dev/null 2>&1 || true +yum -y install ${MANYLINUX1_DEPS} && yum -y clean all > /dev/null 2>&1 || true +yum list installed +# we don't need libpython*.a, and they're many megabytes +find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f +# Strip what we can -- and ignore errors, because this just attempts to strip +# *everything*, including non-ELF files: +find /opt/_internal -type f -print0 \ + | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true +# We do not need the Python test suites, or indeed the precompiled .pyc and +# .pyo files. 
Partially cribbed from: +# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile +find /opt/_internal \ + \( -type d -a -name test -o -name tests \) \ + -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ + -print0 | xargs -0 rm -f + +for PYTHON in /opt/python/*/bin/python; do + # Add matching directory of libpython shared library to library lookup path + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" + + # Smoke test to make sure that our Pythons work, and do indeed detect as + # being manylinux compatible: + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py + # Make sure that SSL cert checking works + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py +done + +# Restore LD_LIBRARY_PATH +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" + +# According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html +# we should install new version ar with 64-bit supported here +wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz +tar xzf binutils-2.27.tar.gz && cd binutils-2.27 +./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh new file mode 100755 index 0000000000000..6f201a8579fea --- /dev/null +++ b/tools/dockerfile/build_scripts/build_utils.sh @@ -0,0 +1,198 @@ +#!/bin/bash +# Helper utilities for build + +PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python +# XXX: the official https server at www.openssl.org cannot be reached +# with the old versions of openssl and curl in Centos 5.11 hence the fallback +# to the ftp mirror: +# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source +OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source +# Ditto the curl sources 
+CURL_DOWNLOAD_URL=http://curl.askapache.com/download + +GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py + +AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf + + +function check_var { + if [ -z "$1" ]; then + echo "required variable not defined" + exit 1 + fi +} + + +function lex_pyver { + # Echoes Python version string padded with zeros + # Thus: + # 3.2.1 -> 003002001 + # 3 -> 003000000 + echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}' +} + + +function do_cpython_build { + local py_ver=$1 + check_var $py_ver + local ucs_setting=$2 + check_var $ucs_setting + tar -xzf Python-$py_ver.tgz + pushd Python-$py_ver + if [ "$ucs_setting" = "none" ]; then + unicode_flags="" + dir_suffix="" + else + local unicode_flags="--enable-unicode=$ucs_setting" + local dir_suffix="-$ucs_setting" + fi + local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}" + mkdir -p ${prefix}/lib + # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + + if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.6) ]; then + wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + tar -zxf sqlite-autoconf-3250300.tar.gz + cd sqlite-autoconf-3250300 + ./configure --prefix=/usr/local + make -j8 && make install + cd ../ && rm sqlite-autoconf-3250300.tar.gz + fi + + # NOTE --enable-shared for generating libpython shared library needed for + # linking of some of the nupic.core test executables. 
+ if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then + # NOTE python 3.7 should be installed via make altinstall rather than + # make install, and we should specify the location of ssl + CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null + make -j8 > /dev/null + make altinstall > /dev/null + else + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null + fi + popd + echo "ZZZ looking for libpython" + find / -name 'libpython*.so*' + rm -rf Python-$py_ver + # Some python's install as bin/python3. Make them available as + # bin/python. + if [ -e ${prefix}/bin/python3 ]; then + ln -s python3 ${prefix}/bin/python + fi + if [ -e ${prefix}/bin/python3.7 ]; then + ln -s python3.7 ${prefix}/bin/python + fi + if [ -e ${prefix}/bin/python3.8 ]; then + ln -s python3.8 ${prefix}/bin/python + fi + # NOTE Make libpython shared library visible to python calls below + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel + cd / + ls ${MY_DIR} + local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) + ln -s ${prefix} /opt/python/${abi_tag} +} + + +function build_cpython { + local py_ver=$1 + check_var $py_ver + check_var $PYTHON_DOWNLOAD_URL + wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz + if [ $(lex_pyver $py_ver) -lt $(lex_pyver 3.3) ]; then + # NOTE We only need wide unicode for nupic.bindings wheel + do_cpython_build $py_ver ucs2 + do_cpython_build $py_ver ucs4 + else + do_cpython_build $py_ver none + fi + rm -f Python-$py_ver.tgz +} + + +function build_cpythons { + for py_ver in $@; do + check_var $GET_PIP_URL + curl -sLO $GET_PIP_URL + 
build_cpython $py_ver + done + rm get-pip.py +} + + +function do_openssl_build { + ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null + make > /dev/null + make install > /dev/null +} + + +function check_sha256sum { + local fname=$1 + check_var ${fname} + local sha256=$2 + check_var ${sha256} + + echo "${sha256} ${fname}" > ${fname}.sha256 + sha256sum -c ${fname}.sha256 + rm ${fname}.sha256 +} + + +function build_openssl { + local openssl_fname=$1 + check_var ${openssl_fname} + local openssl_sha256=$2 + check_var ${openssl_sha256} + check_var ${OPENSSL_DOWNLOAD_URL} + curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz + check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256} + tar -xzf ${openssl_fname}.tar.gz + (cd ${openssl_fname} && do_openssl_build) + rm -rf ${openssl_fname} ${openssl_fname}.tar.gz +} + + +function do_curl_build { + LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_curl { + local curl_fname=$1 + check_var ${curl_fname} + local curl_sha256=$2 + check_var ${curl_sha256} + check_var ${CURL_DOWNLOAD_URL} + curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2 + check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256} + tar -jxf ${curl_fname}.tar.bz2 + (cd ${curl_fname} && do_curl_build) + rm -rf ${curl_fname} ${curl_fname}.tar.bz2 +} + + +function do_standard_install { + ./configure > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_autoconf { + local autoconf_fname=$1 + check_var ${autoconf_fname} + local autoconf_sha256=$2 + check_var ${autoconf_sha256} + check_var ${AUTOCONF_DOWNLOAD_URL} + curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz + check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256} + tar -zxf ${autoconf_fname}.tar.gz + (cd ${autoconf_fname} && do_standard_install) + rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz +} diff --git a/tools/dockerfile/build_scripts/install_gcc.sh 
b/tools/dockerfile/build_scripts/install_gcc.sh new file mode 100644 index 0000000000000..d7db757813303 --- /dev/null +++ b/tools/dockerfile/build_scripts/install_gcc.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Top-level build script called from Dockerfile + +# Stop at any error, show all commands +set -ex + +if [ -f "/etc/redhat-release" ];then + lib_so_5=/usr/lib64/libstdc++.so.6 + lib_so_6=/usr/lib64/libstdc++.so.6 + lib_path=/usr/lib64 +else + lib_so_5=/usr/lib/x86_64-linux-gnu/libstdc++.so.5 + lib_so_6=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 + lib_path=/usr/lib/x86_64-linux-gnu +fi + +if [ "$1" == "gcc82" ]; then + wget https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz + tar -xvf gcc-8.2.0.tar.xz && \ + cd gcc-8.2.0 && \ + unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ + ./contrib/download_prerequisites && \ + cd .. && mkdir temp_gcc82 && cd temp_gcc82 && \ + ../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \ + make -j8 && make install + cd .. 
&& rm -rf temp_gcc82 + cp ${lib_so_6} ${lib_so_6}.bak && rm -f ${lib_so_6} && + ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \ + ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \ + cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path} +fi diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh new file mode 100644 index 0000000000000..6307a52edd18b --- /dev/null +++ b/tools/dockerfile/build_scripts/install_nccl2.sh @@ -0,0 +1,29 @@ +#!/bin/bash +VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") +if [ "$VERSION" == "10.0" ]; then + DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" +elif [ "$VERSION" == "10.1" ]; then + DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" +elif [ "$VERSION" == "9.0" ]; then + DEB="nccl-repo-ubuntu1604-2.3.7-ga-cuda9.0_1-1_amd64.deb" +else + DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb" +fi + +URL="http://nccl2-deb.gz.bcebos.com/$DEB" + +DIR="/nccl2" +mkdir -p $DIR +# we cached the nccl2 deb package in BOS, so we can download it with wget +# install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down +wget -O $DIR/$DEB $URL + +cd $DIR && ar x $DEB && tar xf data.tar.xz +DEBS=$(find ./var/ -name "*.deb") +for sub_deb in $DEBS; do + echo $sub_deb + ar x $sub_deb && tar xf data.tar.xz +done +mv -f usr/include/nccl.h /usr/local/include/ +mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/ +rm -rf $DIR diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh new file mode 100644 index 0000000000000..70297042bc6f4 --- /dev/null +++ b/tools/dockerfile/build_scripts/install_trt.sh @@ -0,0 +1,16 @@ +#!/bin/bash +VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") + +if [[ "$VERSION" == "10.1" ]];then + wget -q 
https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda10.1-cudnn7.tar.gz --no-check-certificate + tar -zxf TensorRT6-cuda10.1-cudnn7.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT6-cuda10.1-cudnn7/include/* /usr/include/ && cp -rf /usr/local/TensorRT6-cuda10.1-cudnn7/lib/* /usr/lib/ +elif [[ "$VERSION" == "10.0" ]];then + wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda10.0-cudnn7.tar.gz --no-check-certificate + tar -zxf TensorRT6-cuda10.0-cudnn7.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT6-cuda10.0-cudnn7/include/* /usr/include/ && cp -rf /usr/local/TensorRT6-cuda10.0-cudnn7/lib/* /usr/lib/ +elif [[ "$VERSION" == "9.0" ]];then + wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda9.0-cudnn7.tar.gz --no-check-certificate + tar -zxf TensorRT6-cuda9.0-cudnn7.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT6-cuda9.0-cudnn7/include/* /usr/include/ && cp -rf /usr/local/TensorRT6-cuda9.0-cudnn7/lib/* /usr/lib/ +fi diff --git a/tools/dockerfile/build_scripts/manylinux1-check.py b/tools/dockerfile/build_scripts/manylinux1-check.py new file mode 100644 index 0000000000000..0d1a6df4eec98 --- /dev/null +++ b/tools/dockerfile/build_scripts/manylinux1-check.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Logic copied from PEP 513 + + +def is_manylinux1_compatible(): + # Only Linux, and only x86-64 / i686 + from distutils.util import get_platform + if get_platform() not in ["linux-x86_64", "linux-i686"]: + return False + + # Check for presence of _manylinux module + try: + import _manylinux + return bool(_manylinux.manylinux1_compatible) + except (ImportError, AttributeError): + # Fall through to heuristic check below + pass + + # Check glibc version. CentOS 5 uses glibc 2.5. + return have_compatible_glibc(2, 5) + + +def have_compatible_glibc(major, minimum_minor): + import ctypes + + process_namespace = ctypes.CDLL(None) + try: + gnu_get_libc_version = process_namespace.gnu_get_libc_version + except AttributeError: + # Symbol doesn't exist -> therefore, we are not linked to + # glibc. + return False + + # Call gnu_get_libc_version, which returns a string like "2.5". + gnu_get_libc_version.restype = ctypes.c_char_p + version_str = gnu_get_libc_version() + # py2 / py3 compatibility: + if not isinstance(version_str, str): + version_str = version_str.decode("ascii") + + # Parse string and check against requested version. + version = [int(piece) for piece in version_str.split(".")] + assert len(version) == 2 + if major != version[0]: + return False + if minimum_minor > version[1]: + return False + return True + + +import sys +if is_manylinux1_compatible(): + print("%s is manylinux1 compatible" % (sys.executable, )) + sys.exit(0) +else: + print("%s is NOT manylinux1 compatible" % (sys.executable, )) + sys.exit(1) diff --git a/python/paddle/fleet/base/obj_creator.py b/tools/dockerfile/build_scripts/python-tag-abi-tag.py similarity index 60% rename from python/paddle/fleet/base/obj_creator.py rename to tools/dockerfile/build_scripts/python-tag-abi-tag.py index 15a403d79edcf..0364ab3659e49 100644 --- a/python/paddle/fleet/base/obj_creator.py +++ b/tools/dockerfile/build_scripts/python-tag-abi-tag.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from util_base import UtilBase +# Utility script to print the python tag + the abi tag for a Python +# See PEP 425 for exactly what these are, but an example would be: +# cp27-cp27mu +from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag -def _create_fleet_obj_from_role_maker(role_maker): - pass - - -def _create_fleet_util_from_role_maker(role_maker): - pass +print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag())) diff --git a/tools/dockerfile/build_scripts/ssl-check.py b/tools/dockerfile/build_scripts/ssl-check.py new file mode 100644 index 0000000000000..afef2812f3fb4 --- /dev/null +++ b/tools/dockerfile/build_scripts/ssl-check.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cf. 
https://github.com/pypa/manylinux/issues/53 + +GOOD_SSL = "https://google.com" +BAD_SSL = "https://self-signed.badssl.com" + +import sys + +print("Testing SSL certificate checking for Python:", sys.version) + +if (sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4)): + print("This version never checks SSL certs; skipping tests") + sys.exit(0) + +if sys.version_info[0] >= 3: + from urllib.request import urlopen + EXC = OSError +else: + from urllib import urlopen + EXC = IOError + +print("Connecting to %s should work" % (GOOD_SSL, )) +urlopen(GOOD_SSL) +print("...it did, yay.") + +print("Connecting to %s should fail" % (BAD_SSL, )) +try: + urlopen(BAD_SSL) + # If we get here then we failed: + print("...it DIDN'T!!!!!11!!1one!") + sys.exit(1) +except EXC: + print("...it did, yay.") diff --git a/tools/dockerfile/centos6_manylinux.sh b/tools/dockerfile/centos6_manylinux.sh new file mode 100755 index 0000000000000..ea9c8a7bf36f0 --- /dev/null +++ b/tools/dockerfile/centos6_manylinux.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -xe + +REPO="${REPO:-paddledocker}" + +function make_cuda9cudnn7(){ + sed 's//9.0-cudnn7-devel-centos6/g' Dockerfile.centos >Dockerfile.tmp +} + + +function make_cuda10cudnn7() { + sed 's//10.0-cudnn7-devel-centos6/g' Dockerfile.centos >Dockerfile.tmp +} + + +function make_cuda101cudnn7() { + sed 's//10.1-cudnn7-devel-centos6/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#COPY build_scripts /build_scripts#COPY build_scripts /build_scripts \nRUN bash build_scripts/install_gcc.sh gcc82 \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH#g" Dockerfile.tmp +} + + +function main() { + local CMD=$1 + case $CMD in + cuda9cudnn7) + make_cuda9cudnn7 + ;; + cuda10cudnn7) + make_cuda10cudnn7 + ;; + cuda101cudnn7) + make_cuda101cudnn7 + ;; + *) + echo "Make dockerfile error, Without this paramet." 
+ exit 1 + ;; + esac +} + +main $@ diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh new file mode 100644 index 0000000000000..f088c5728a5d5 --- /dev/null +++ b/tools/dockerfile/ci_dockerfile.sh @@ -0,0 +1,37 @@ +#!/bin/bash +function make_ubuntu_dockerfile(){ + dockerfile_name="Dockerfile.cuda10_cudnn7_gcc82_ubuntu16" + sed 's//10.1-cudnn7-devel-ubuntu16.04/g' ./Dockerfile.ubuntu >${dockerfile_name} + sed -i 's#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g' ${dockerfile_name} + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` + sed -i 's##WORKDIR /usr/bin \ + COPY tools/dockerfile/build_scripts /build_scripts \ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \ + ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name} + +} + + +function make_centos_dockerfile(){ + dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6" + sed 's//9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name} + sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name} + dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} +} + + +function main() { + make_ubuntu_dockerfile + make_centos_dockerfile +} + +main $@ diff --git 
a/tools/dockerfile/icode.sh b/tools/dockerfile/icode.sh new file mode 100755 index 0000000000000..da3ffb8c77db7 --- /dev/null +++ b/tools/dockerfile/icode.sh @@ -0,0 +1,109 @@ +#!/bin/bash + + +function install_gcc(){ + sed -i 's##RUN apt-get update \ + WORKDIR /usr/bin \ + RUN apt install -y gcc-4.8 g++-4.8 \&\& cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \&\& ln -s gcc-4.8 gcc \&\& ln -s g++-4.8 g++ #g' $1 +} + + +function install_gcc8(){ + sed -i 's##WORKDIR /usr/bin \ + COPY tools/dockerfile/build_scripts /build_scripts \ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \ + ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' $1 +} + + +function centos_gcc8(){ + sed -i "s#COPY build_scripts /build_scripts#COPY build_scripts /build_scripts \nRUN bash build_scripts/install_gcc.sh gcc82 \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH#g" $1 +} + + +function fix_https(){ + sed -i 's#https#http#g' /etc/apt/sources.list.d/nvidia-ml.list + sed -i 's#https#http#g' /etc/apt/sources.list.d/cuda.list +} + + +function all_change(){ + sed -i 's#ENV HOME /root#ENV HOME /root\nENV DEBIAN_FRONTEND=noninteractive#g' Dockerfile.ubuntu +} + +function centos() { + # centos6 + sed 's##8.0-cudnn7-devel-centos6#g' Dockerfile.centos >test/centos_6_cpu_runtime.dockerfile + sed 's##9.0-cudnn7-devel-centos6#g' Dockerfile.centos >test/centos_6_gpu_cuda9.0_cudnn7_single_gpu_runtime.dockerfile + sed 's##9.1-cudnn7-devel-centos6#g' Dockerfile.centos >test/centos_6_gpu_cuda9.1_cudnn7_single_gpu_runtime.dockerfile + sed 's##9.2-cudnn7-devel-centos6#g' Dockerfile.centos >test/centos_6_gpu_cuda9.2_cudnn7_single_gpu_runtime.dockerfile + sed 's##10.0-cudnn7-devel-centos6#g' 
Dockerfile.centos >test/centos_6_gpu_cuda10.0_cudnn7_single_gpu_runtime.dockerfile + sed 's##10.1-cudnn7-devel-centos6#g' Dockerfile.centos >test/centos_6_gpu_cuda10.1_cudnn7_single_gpu_runtime.dockerfile + centos_gcc8 "test/centos_6_gpu_cuda10.1_cudnn7_single_gpu_runtime.dockerfile" + + # centos7 + sed 's##8.0-cudnn7-devel-centos7#g' Dockerfile.centos >test/centos_7_cpu_runtime.dockerfile + sed 's##9.0-cudnn7-devel-centos7#g' Dockerfile.centos >test/centos_7_gpu_cuda9.0_cudnn7_single_gpu_runtime.dockerfile + sed 's##9.1-cudnn7-devel-centos7#g' Dockerfile.centos >test/centos_7_gpu_cuda9.1_cudnn7_single_gpu_runtime.dockerfile + sed 's##9.2-cudnn7-devel-centos7#g' Dockerfile.centos >test/centos_7_gpu_cuda9.2_cudnn7_single_gpu_runtime.dockerfile + sed 's##10.0-cudnn7-devel-centos7#g' Dockerfile.centos >test/centos_7_gpu_cuda10.0_cudnn7_single_gpu_runtime.dockerfile + sed 's##10.1-cudnn7-devel-centos7#g' Dockerfile.centos >test/centos_7_gpu_cuda10.1_cudnn7_single_gpu_runtime.dockerfile + centos_gcc8 "test/centos_7_gpu_cuda10.1_cudnn7_single_gpu_runtime.dockerfile" +} + + +function ubuntu() { + # ubuntu 14 + sed 's##8.0-cudnn7-devel-ubuntu14.04#g' Dockerfile.ubuntu >test/ubuntu_1404_cpu.dockerfile + install_gcc "test/ubuntu_1404_cpu.dockerfile" + sed 's##10.0-cudnn7-devel-ubuntu14.04#g' Dockerfile.ubuntu >test/ubuntu_1404_gpu_cuda10.0_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1404_gpu_cuda10.0_cudnn7_runtime.dockerfile" + sed 's##10.1-cudnn7-devel-ubuntu14.04#g' Dockerfile.ubuntu >test/ubuntu_1404_gpu_cuda10.1_cudnn7_runtime.dockerfile + install_gcc8 "test/ubuntu_1404_gpu_cuda10.1_cudnn7_runtime.dockerfile" + + # ubuntu 16 + sed 's##8.0-cudnn7-devel-ubuntu16.04#g' Dockerfile.ubuntu >test/ubuntu_1604_cpu.dockerfile + install_gcc "test/ubuntu_1604_cpu.dockerfile" + sed 's##9.0-cudnn7-devel-ubuntu16.04#g' Dockerfile.ubuntu >test/ubuntu_1604_gpu_cuda9.0_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1604_gpu_cuda9.0_cudnn7_runtime.dockerfile" + sed 
's##9.1-cudnn7-devel-ubuntu16.04#g' Dockerfile.ubuntu >test/ubuntu_1604_gpu_cuda9.1_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1604_gpu_cuda9.1_cudnn7_runtime.dockerfile" + sed 's##9.2-cudnn7-devel-ubuntu16.04#g' Dockerfile.ubuntu >test/ubuntu_1604_gpu_cuda9.2_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1604_gpu_cuda9.2_cudnn7_runtime.dockerfile" + sed 's##10.0-cudnn7-devel-ubuntu16.04#g' Dockerfile.ubuntu >test/ubuntu_1604_gpu_cuda10.0_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1604_gpu_cuda10.0_cudnn7_runtime.dockerfile" + sed 's##10.1-cudnn7-devel-ubuntu16.04#g' Dockerfile.ubuntu >test/ubuntu_1604_gpu_cuda10.1_cudnn7_runtime.dockerfile + install_gcc8 "test/ubuntu_1604_gpu_cuda10.1_cudnn7_runtime.dockerfile" + + # ubuntu 18 + sed 's##8.0-cudnn7-devel-ubuntu18.04#g' Dockerfile.ubuntu >test/ubuntu_1804_cpu.dockerfile + install_gcc "test/ubuntu_1804_cpu.dockerfile" + sed 's##9.0-cudnn7-devel-ubuntu18.04#g' Dockerfile.ubuntu >test/ubuntu_1804_gpu_cuda9.0_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1804_gpu_cuda9.0_cudnn7_runtime.dockerfile" + sed 's##9.1-cudnn7-devel-ubuntu18.04#g' Dockerfile.ubuntu >test/ubuntu_1804_gpu_cuda9.1_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1804_gpu_cuda9.1_cudnn7_runtime.dockerfile" + sed 's##9.2-cudnn7-devel-ubuntu18.04#g' Dockerfile.ubuntu >test/ubuntu_1804_gpu_cuda9.2_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1804_gpu_cuda9.2_cudnn7_runtime.dockerfile" + sed 's##10.0-cudnn7-devel-ubuntu18.04#g' Dockerfile.ubuntu >test/ubuntu_1804_gpu_cuda10.0_cudnn7_runtime.dockerfile + install_gcc "test/ubuntu_1804_gpu_cuda10.0_cudnn7_runtime.dockerfile" + sed 's##10.1-cudnn7-devel-ubuntu18.04#g' Dockerfile.ubuntu >test/ubuntu_1804_gpu_cuda10.1_cudnn7_runtime.dockerfile + install_gcc8 "test/ubuntu_1804_gpu_cuda10.1_cudnn7_runtime.dockerfile" +} + + +function main() { + if [ ! 
-d "test" ];then + mkdir test + fi + all_change + centos + ubuntu +} + + +main diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh new file mode 100755 index 0000000000000..e7827b6598eeb --- /dev/null +++ b/tools/dockerfile/ubuntu16_dev.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +docker_name=$1 + +function ref_whl(){ + if [[ ${WITH_GPU} == "ON" ]]; then + ref_gpu=gpu-cuda${ref_CUDA_MAJOR}-cudnn${CUDNN_MAJOR} + install_gpu="_gpu" + else + ref_gpu="cpu" + install_gpu="" + fi + + if [[ ${WITH_MKL} == "ON" ]]; then + ref_mkl=mkl + else + ref_mkl=openblas + fi + + if [[ ${gcc_version} == "8.2.0" ]];then + ref_gcc=_gcc8.2 + fi + + ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" + + if [[ ${PADDLE_BRANCH} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + else + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + fi + + if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl + 
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl + else + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + fi +} + + +function install_whl(){ + dockerfile_line=`wc -l Dockerfile.tmp|awk '{print $1}'` + sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle_whl} && pip install ${ref_paddle_whl} && rm -f ${ref_paddle_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle3_whl} && pip3.5 install ${ref_paddle3_whl} && rm -f ${ref_paddle3_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle36_whl} && pip3.6 install ${ref_paddle36_whl} && rm -f ${ref_paddle36_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle37_whl} && pip3.7 install ${ref_paddle37_whl} && rm -f ${ref_paddle37_whl}" Dockerfile.tmp +} + +function install_gcc(){ + if [ "${gcc_version}" == "8.2.0" ];then + sed -i 's##WORKDIR /usr/bin \ + COPY tools/dockerfile/build_scripts /build_scripts \ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \ + ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' Dockerfile.tmp + else + sed -i 's##RUN apt-get update \ + WORKDIR /usr/bin \ + RUN apt install -y gcc-4.8 
g++-4.8 \&\& cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \&\& ln -s gcc-4.8 gcc \&\& ln -s g++-4.8 g++ #g' Dockerfile.tmp + fi +} + + + +function make_dockerfile(){ + sed "s//${docker_name}/g" tools/dockerfile/Dockerfile.ubuntu >Dockerfile.tmp +} + +function main(){ + make_dockerfile + install_gcc + ref_whl + install_whl +} + +main $@ diff --git a/tools/manylinux1/Dockerfile.GCC8 b/tools/manylinux1/Dockerfile.GCC8 deleted file mode 100644 index 52593c42b294d..0000000000000 --- a/tools/manylinux1/Dockerfile.GCC8 +++ /dev/null @@ -1,191 +0,0 @@ -# A image for building paddle binaries and install -# Use cuda devel base image for both cpu and gpu environment -# When you modify it, please be aware of cudnn-runtime version -# and libcudnn.so.x in paddle/scripts/docker/build.sh -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -MAINTAINER PaddlePaddle Authors - -# ENV variables -ARG WITH_GPU -ARG WITH_AVX - -ENV WITH_GPU=${WITH_GPU:-ON} -ENV WITH_AVX=${WITH_AVX:-ON} - -ENV HOME /root -# Add bash enhancements -COPY ./paddle/scripts/docker/root/ /root/ - -# Prepare packages for Python -RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ - libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ - xz-utils tk-dev libffi-dev liblzma-dev -RUN apt-get install -y python-dev python-pip wget vim git - -# install cmake -WORKDIR /home -RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz -RUN tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz -RUN apt install libidn11 -ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH -WORKDIR /usr/bin -RUN wget -q http://mirror.linux-ia64.org/gnu/gcc/releases/gcc-8.2.0/gcc-8.2.0.tar.xz && \ - tar -xvf gcc-8.2.0.tar.xz && \ - cd gcc-8.2.0 && \ - unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ - ./contrib/download_prerequisites && \ - cd .. 
&& mkdir temp_gcc82 && cd temp_gcc82 && \ - ../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \ - make -j8 && make install -RUN cp gcc gcc.bak -RUN cp g++ g++.bak -RUN rm gcc -RUN rm g++ -RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc -RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ -RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc -RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ -ENV PATH=/usr/local/gcc-8.2/bin:$PATH -RUN cd .. && rm -rf /usr/bin/temp_gcc82 - -# Install Python3.6 -RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ - tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ - ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ - wget -q https://www.python.org/ftp/python/3.6.9/Python-3.6.9.tgz && \ - tar -xzf Python-3.6.9.tgz && cd Python-3.6.9 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ - make -j8 > /dev/null && make altinstall > /dev/null - -# Install Python3.7 -RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ - tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ - make -j8 > /dev/null && make altinstall > /dev/null - -RUN rm -r /root/python_build - -RUN apt-get update && \ - apt-get install -y --allow-downgrades --allow-change-held-packages \ - patchelf python3 python3-dev python3-pip \ - git python-pip python-dev python-opencv openssh-server bison \ - wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ - curl sed grep graphviz libjpeg-dev zlib1g-dev \ - python-matplotlib \ - automake locales clang-format swig \ - liblapack-dev liblapacke-dev \ - net-tools libtool module-init-tools && \ - apt-get clean -y - -# Install Python2.7.15 to replace original python -WORKDIR /home 
-ENV version=2.7.15 -RUN wget https://www.python.org/ftp/python/$version/Python-$version.tgz -RUN tar -xvf Python-$version.tgz -WORKDIR /home/Python-$version -RUN ./configure --enable-unicode=ucs4 --enable-shared CFLAGS=-fPIC --prefix=/usr/local/python2.7.15 -RUN make && make install - -RUN echo "export PATH=/usr/local/python2.7.15/include:${PATH}" >> ~/.bashrc -RUN echo "export PATH=/usr/local/python2.7.15/bin:${PATH}" >> ~/.bashrc -RUN echo "export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH}" >> ~/.bashrc -RUN echo "export CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH" >> ~/.bashrc -ENV PATH=/usr/local/python2.7.15/include:${PATH} -ENV PATH=/usr/local/python2.7.15/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH} -ENV CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH -RUN mv /usr/bin/python /usr/bin/python.bak -RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/local/bin/python -RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/bin/python -WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip -RUN apt-get -y install unzip -RUN unzip setuptools-40.6.2.zip -WORKDIR /home/setuptools-40.6.2 -RUN python setup.py build -RUN python setup.py install -WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz -RUN tar -zxvf pip-18.0.tar.gz -WORKDIR pip-18.0 -RUN python setup.py install - -WORKDIR /home -RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \ - rm -r Python-$version setuptools-40.6.2 pip-18.0 - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# 
sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. -RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python && \ - pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python && \ - pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python && \ - pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python - -#For docstring checker -RUN pip3 --no-cache-dir install pylint pytest astroid isort -RUN pip3.6 --no-cache-dir install pylint pytest astroid isort -RUN pip3.7 --no-cache-dir install pylint pytest astroid isort -RUN pip --no-cache-dir install pylint pytest 
astroid isort LinkChecker - -RUN pip3 --no-cache-dir install coverage -RUN pip3.6 --no-cache-dir install coverage -RUN pip3.7 --no-cache-dir install coverage -RUN pip --no-cache-dir install coverage - -COPY ./python/requirements.txt /root/ -RUN pip3 --no-cache-dir install -r /root/requirements.txt -RUN pip3.6 --no-cache-dir install -r /root/requirements.txt -RUN pip3.7 --no-cache-dir install -r /root/requirements.txt -RUN pip --no-cache-dir install -r /root/requirements.txt - -# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y -RUN pip3 --no-cache-dir install certifi urllib3[secure] -RUN pip3.6 --no-cache-dir install certifi urllib3[secure] -RUN pip3.7 --no-cache-dir install certifi urllib3[secure] -RUN pip --no-cache-dir install certifi urllib3[secure] - -# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service -RUN mkdir /var/run/sshd -RUN echo 'root:root' | chpasswd -RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -CMD source ~/.bashrc -EXPOSE 22 - -# ccache 3.7.9 -RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ - tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ - ./configure -prefix=/usr/local/ccache-3.7.9 && \ - make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache diff --git a/tools/manylinux1/Dockerfile.Inference b/tools/manylinux1/Dockerfile.Inference index e045fc52109e8..0ba180b894b22 120000 --- a/tools/manylinux1/Dockerfile.Inference +++ b/tools/manylinux1/Dockerfile.Inference @@ -1 +1 @@ -Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 \ No newline at end of file +Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 \ No newline at end of file diff --git 
a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index 5bb471da0e94f..837f0e486f611 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -41,6 +41,7 @@ RUN wget -q https://paddle-docker-tar.bj.bcebos.com/home/users/tianshuo/bce-pyth make -j8 && make install ENV PATH=/usr/local/gcc-8.2/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/gcc-8.2/lib64:$LD_LIBRARY_PATH RUN rm -rf /temp_gcc82 && rm -rf /gcc-8.2.0.tar.xz && rm -rf /gcc-8.2.0 # Install Python3.6 @@ -142,6 +143,11 @@ RUN wget -q https://paddlepaddledeps.bj.bcebos.com/TensorRT-6.0.1.5.Ubuntu-16.04 tar -zxf TensorRT-6.0.1.5.Ubuntu-16.04.x86_64-gnu.cuda-10.1.cudnn7.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT-6.0.1.5/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.5/lib/* /usr/lib/ +# Install patchelf-0.10 +RUN wget https://paddle-ci.gz.bcebos.com/patchelf-0.10.tar.gz && \ + tar -zxvf patchelf-0.10.tar.gz && cd patchelf-0.10 && \ + ./configure && make -j8 && make install + # git credential to skip password typing RUN git config --global credential.helper store diff --git a/tools/wlist.json b/tools/wlist.json index 5382bce6356b5..6989882504ede 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -108,6 +108,11 @@ "Metric.accumulate", "Metric.name", "Metric.add_metric_op", + "Accuracy.reset", + "Accuracy.update", + "Accuracy.accumulate", + "Accuracy.name", + "Accuracy.add_metric_op", "Callback.set_params", "Callback.on_train_begin", "Callback.on_train_end", @@ -122,7 +127,8 @@ "Callback.on_eval_batch_begin", "Callback.on_eval_batch_end", "Callback.on_test_batch_begin", - "Callback.on_test_batch_end" + "Callback.on_test_batch_end", + "Model.prepare" ], "wlist_no_op_pass":[ "gelu",