diff --git a/AUTHORS.md b/AUTHORS.md
index e5481d83de190..a8ea5c46e94d2 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -57,6 +57,7 @@
 | reyoung | Yang Yu |
 | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus |
 | [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek |
+| Silv3S | Slawomir Siwek |
 | sneaxiy | Jin-Le Zeng |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9002cb287e855..ff49ba164dd7f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,7 +100,11 @@ if(APPLE AND WITH_ARM)
 endif()
 if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+ if(WITH_ARM_BRPC)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+ else()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+ endif()
 endif()
 if(WIN32)
@@ -386,7 +390,7 @@ if(WITH_DISTRIBUTE)
 if(LINUX)
 set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
 endif()
- if(WITH_ASCEND_CL)
+ if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
 # disable WITH_PSCORE for NPU before include third_party
 MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
 set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
diff --git a/README.md b/README.md
index c4c5decec5430..21e0aba8b48bf 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
 [Click here to learn more](https://github.com/PaddlePaddle/Fleet)
-- **High-Performance Inference Engines for Comprehensive Deployment Enviroments**
+- **High-Performance Inference Engines for Comprehensive Deployment Environments**
 PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
index 4641184fcf527..4d813a0726dc0 100644
--- a/cmake/coverallsGcovJsons.cmake
+++ b/cmake/coverallsGcovJsons.cmake
@@ -238,7 +238,7 @@ foreach (GCOV_FILE ${GCOV_FILES})
 message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
 # Loads the gcov file as a list of lines.
- # (We first open the file and replace all occurences of [] with _
+ # (We first open the file and replace all occurrences of [] with _
 # because CMake will fail to parse a line containing unmatched brackets...
 # also the \ to escaped \n in macros screws up things.)
# https://public.kitware.com/Bug/view.php?id=15369 diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index cda8029bfe4e4..d5ccf1297922f 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220412") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220425") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 5742a6b602ff3..f9cac0579fec4 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -158,12 +158,15 @@ if(WITH_IPU) ) endif() +if(WITH_ASCEND_CL AND WITH_ARM_BRPC) + set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new) +endif() + if(NOT APPLE) if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM)) set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-format-truncation # Warning in boost gcc 8.2 - -Wno-error=cast-function-type # Warning in boost gcc 8.2 -Wno-error=parentheses # Warning in boost gcc 8.2 -Wno-error=catch-value # Warning in boost gcc 8.2 -Wno-error=nonnull-compare # Warning in boost gcc 8.2 diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index ef57bb5ba232c..31f9b26e732d1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h" +#include #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -24,6 +25,8 @@ namespace paddle { namespace distributed { using Place = paddle::platform::Place; +int ProcessGroupHeter::send_count = 0; +int ProcessGroupHeter::recv_count = 0; std::shared_ptr ProcessGroupHeter::CreateTask( int rank, CommType comm_type, const std::vector& inputs) { @@ -47,7 +50,8 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) { ProcessGroupHeter::ProcessGroupHeter( const std::shared_ptr& store, int rank, int size, const platform::Place& place, int gid, int local_rank, int local_size, - int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint) + int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint, + int src_rank, int dst_rank) : ProcessGroup(rank, size, place, gid), store_(store), local_rank_(local_rank), @@ -55,7 +59,10 @@ ProcessGroupHeter::ProcessGroupHeter( gloo_rank_(gloo_rank), gloo_size_(gloo_size), with_switch_(with_switch), - switch_endpoint_(switch_endpoint) { + switch_endpoint_(switch_endpoint), + src_rank_(src_rank), + dst_rank_(dst_rank) { + return; #if defined(PADDLE_WITH_NCCL) inner_pg_ = std::make_shared(store, local_rank, local_size, place_, IGNORE_ID); @@ -116,7 +123,7 @@ std::shared_ptr ProcessGroupHeter::AllReduce( HeterClient* client_ = HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); auto dense_cpu_tensor = cpu_tensors[0]; - std::vector send_size; + std::vector send_size; send_size.push_back(dense_cpu_tensor.numel()); int ret = client_->Send( gid_, {dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(), @@ -212,7 +219,7 @@ std::shared_ptr ProcessGroupHeter::Broadcast( HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); auto dense_cpu_tensor = cpu_tensors[0]; if (gloo_rank_ == 0) { - std::vector send_size; + std::vector send_size; send_size.push_back(dense_cpu_tensor.numel()); int ret = client_->Send( gid_, {dense_cpu_tensor.name()}, send_size, @@ -246,5 +253,100 @@ std::shared_ptr ProcessGroupHeter::Broadcast( return CreateTask(rank_, CommType::BROADCAST, in_tensors); } +std::shared_ptr ProcessGroupHeter::Send( + std::vector& in_tensors, int peer) { +#if defined(PADDLE_WITH_NCCL) + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); +#endif + + PADDLE_ENFORCE_EQ( + in_tensors.size(), 1, + platform::errors::PreconditionNotMet( + "For each send operation, there can only be one tensor to send.")); + // Copy Tensor to cpu + auto start = std::chrono::high_resolution_clock::now(); + phi::DenseTensor cpu_tensor; + auto& gpu_tensor = in_tensors[0]; + framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); + PADDLE_ENFORCE_EQ(with_switch_, true, + platform::errors::PreconditionNotMet( + "Gloo does not support the send operation.")); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims() + << ") from gpu to cpu for send " << std::setw(9) + << " is: " << diff.count() << " s" << std::endl; + + // Send to switch + HeterClient* client_ = + HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + int64_t tensor_size = + cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()); + std::vector send_size; + 
send_size.push_back(tensor_size); + auto id = src_rank_ * 10000 + dst_rank_; + std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) + + std::string("_") + std::to_string(send_count++); + VLOG(2) << "tensor_name:" << tensor_name; + int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(), + tensor_size); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Send to the switch module error.")); + return CreateTask(rank_, CommType::SEND, in_tensors); +} + +std::shared_ptr ProcessGroupHeter::Recv( + std::vector& out_tensors, int peer) { +#if defined(PADDLE_WITH_NCCL) + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); +#endif + + PADDLE_ENFORCE_EQ( + out_tensors.size(), 1, + platform::errors::PreconditionNotMet( + "For each rece operation, there can only be one tensor to receive.")); + + // Copy Tensor to cpu + phi::DenseTensor cpu_tensor; + auto& gpu_tensor = out_tensors[0]; + cpu_tensor.Resize(gpu_tensor.dims()); + cpu_tensor.set_layout(gpu_tensor.layout()); + cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype()); + + PADDLE_ENFORCE_EQ(with_switch_, true, + platform::errors::PreconditionNotMet( + "Gloo does not support the send operation.")); + // recv from switch + HeterClient* client_ = + HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + auto id = src_rank_ * 10000 + dst_rank_; + std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) + + std::string("_") + std::to_string(recv_count++); + VLOG(2) << "tensor_name: " << tensor_name; + auto start = std::chrono::high_resolution_clock::now(); + int ret = client_->Recv( + gid_, {tensor_name}, cpu_tensor.data(), + cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype())); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "receive to the switch module error.")); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + double goodput = cpu_tensor.numel() * + framework::DataTypeSize(cpu_tensor.dtype()) / diff.count(); + VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl; + start = std::chrono::high_resolution_clock::now(); + framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor); + end = std::chrono::high_resolution_clock::now(); + diff = end - start; + VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims() + << ") from gpu to cpu for recv " << std::setw(9) + << " is: " << diff.count() << " s" << std::endl; + return CreateTask(rank_, CommType::RECV, out_tensors); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 640acdfb6a23b..89b0f078b4af5 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -83,7 +83,8 @@ class ProcessGroupHeter : public ProcessGroup { ProcessGroupHeter(const std::shared_ptr& store, int rank, int size, const platform::Place& place, int gid, int local_rank, int local_size, int gloo_rank, int gloo_size, - bool with_switch, std::string switch_endpoints); + bool with_switch, std::string switch_endpoints, + int src_rank, int dst_rank); const std::string GetBackendName() const override { return std::string(HETER_BACKEND_NAME); @@ -97,6 +98,12 @@ class ProcessGroupHeter : public ProcessGroup { std::vector&, std::vector&, const 
BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Send( + std::vector& in_tensors, int peer) override; + + std::shared_ptr Recv( + std::vector& out_tensors, int peer) override; + protected: virtual std::shared_ptr CreateTask( int rank, CommType opType, const std::vector& inputs); @@ -112,6 +119,10 @@ class ProcessGroupHeter : public ProcessGroup { int gloo_size_; bool with_switch_; std::string switch_endpoint_; + int src_rank_; + int dst_rank_; + static int send_count; + static int recv_count; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 75153df936b1c..a7c3e2208ab74 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -447,10 +447,12 @@ void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { while (!queue.empty()) { egr::GradNodeBase *node = queue.front(); queue.pop(); - const std::vector> &edges = node->GetEdges(); - for (size_t i = 0; i < edges.size(); i++) { - for (size_t j = 0; j < edges[i].size(); j++) { - const egr::Edge &edge = edges[i][j]; + const paddle::small_vector, + egr::kSlotSmallVectorSize> &metas = + node->OutputMeta(); + for (size_t i = 0; i < metas.size(); i++) { + for (size_t j = 0; j < metas[i].size(); j++) { + const egr::Edge &edge = metas[i][j].GetEdge(); auto next_node_shared = edge.GetMutableGradNode(); if (!next_node_shared || !next_node_shared.get()) { continue; diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt index f0ac7bc6a0635..e7519ef4998b1 100755 --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -1,10 +1,15 @@ set(BRPC_SRCS ps_client.cc server.cc) set_source_files_properties(${BRPC_SRCS}) + if(WITH_HETERPS) + set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context rocksdb) + else() + set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib leveldb snappy gflags glog device_context) + endif() brpc_library(sendrecv_rpc SRCS diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc old mode 100644 new mode 100755 index 921a110984a4a..78673184eb23b --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -55,8 +55,6 @@ DEFINE_int32(pserver_sparse_merge_thread, 1, "pserver sparse merge thread num"); DEFINE_int32(pserver_sparse_table_shard_num, 1000, "sparse table shard for save & load"); -DEFINE_int32(heter_world_size, 100, "group size"); // 可配置 - namespace paddle { namespace framework { class Scope; diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc old mode 100644 new mode 100755 index 16c1ff764dc3c..fd0962caaaead --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -17,10 +17,14 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_int32(heter_world_size, 100, "group size"); // group max size +DEFINE_int32(switch_send_recv_timeout_s, 600, "switch_send_recv_timeout_s"); + namespace paddle { namespace distributed { - std::shared_ptr HeterClient::s_instance_ = nullptr; +std::mutex HeterClient::mtx_; +std::shared_ptr HeterClient::switch_s_instance_ = nullptr; int GetMicroId(const 
platform::DeviceContext& ctx, const framework::Scope* scope) { @@ -222,6 +226,7 @@ int HeterClient::Send(const platform::DeviceContext& ctx, distributed::MultiVarMsg request; // 1. set req message_name(string) request.set_message_name(message_name); + request.set_group_id(0); // 2. set req send_var_names() for (auto& send_var_name : send_var_names) { @@ -263,7 +268,7 @@ int HeterClient::Send(const platform::DeviceContext& ctx, } int HeterClient::Send(int group_id, const std::vector& var_names, - const std::vector& vars_len, void* data_ptr, + const std::vector& vars_size, void* data_ptr, int64_t data_size) { OnHeterRpcDone* closure = new OnHeterRpcDone([](void* done) { auto* closure = reinterpret_cast(done); @@ -282,7 +287,7 @@ int HeterClient::Send(int group_id, const std::vector& var_names, for (auto& send_var_name : var_names) { request.add_send_var_names(send_var_name); } - for (auto var_len : vars_len) { + for (auto var_len : vars_size) { request.add_vars_len(var_len); } auto& request_buffer = closure->cntl.request_attachment(); @@ -301,6 +306,7 @@ int HeterClient::Send(int group_id, const std::vector& var_names, ::paddle::distributed::PsService_Stub stub(channel); stub.SendToSwitch(&closure->cntl, &request, &closure->ps_response, closure); fut.wait(); + delete closure; return 0; } @@ -325,6 +331,7 @@ int HeterClient::Recv(const platform::DeviceContext& ctx, distributed::MultiVarMsg request; // 1. set req message_name(string) request.set_message_name(message_name); + request.set_group_id(0); // 2. set req recv_var_names() for (auto& recv_var_name : recv_var_names) { @@ -396,8 +403,8 @@ int HeterClient::Recv(int group_id, const std::vector& var_names, // save in worker auto& res_io_buffer = closure->cntl.response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); - io_buffer_itr.copy_and_forward(reinterpret_cast(data_ptr), - data_size * sizeof(float)); + io_buffer_itr.copy_and_forward(reinterpret_cast(data_ptr), data_size); + delete closure; VLOG(4) << "Recv done"; return 0; } diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h old mode 100755 new mode 100644 index d1e0f21c7dd84..36bafc943701f --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -138,7 +138,8 @@ class HeterClient { const std::string& mode = "forward"); int Send(int group_id, const std::vector& var_names, - const std::vector& vars_len, void* data_ptr, int64_t data_size); + const std::vector& vars_len, void* data_ptr, + int64_t data_size); int Send(const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& message_name, @@ -168,16 +169,22 @@ class HeterClient { } // switch client singleton - static HeterClient& GetSwitchInstance( + static std::shared_ptr GetSwitchInstance( const std::vector& peer_endpoints, int32_t peer_role) { - static HeterClient switch_s_instance_; - if (peer_endpoints.empty()) { - VLOG(4) << "init switch client failed, null peer_endpoints"; + if (switch_s_instance_ == nullptr) { + std::unique_lock lock(mtx_); + if (peer_endpoints.empty()) { + VLOG(4) << "init switch client failed, null peer_endpoints"; + } + VLOG(4) << "peer role is: " << peer_role + << ", addr is: " << peer_endpoints[0]; + if (switch_s_instance_ == nullptr) { + switch_s_instance_.reset(new HeterClient()); + switch_s_instance_->SetPeerSwitchList(peer_endpoints); + switch_s_instance_->InitClientChannels(false, peer_endpoints, + peer_role); + } } - VLOG(4) << "peer 
role is: " << peer_role - << ", addr is: " << peer_endpoints[0]; - switch_s_instance_.SetPeerSwitchList(peer_endpoints); - switch_s_instance_.InitClientChannels(false, peer_endpoints, peer_role); return switch_s_instance_; } @@ -229,6 +236,8 @@ class HeterClient { HeterClient(const HeterClient&); static std::shared_ptr s_instance_; + static std::mutex mtx_; + static std::shared_ptr switch_s_instance_; std::vector> xpu_channels_; std::vector> previous_xpu_channels_; diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index 292b12611c494..0753a6799c1be 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -20,8 +20,8 @@ namespace paddle { namespace distributed { // DEFINE_string(cert_path, "./cert.pem", "cert.pem path"); // DEFINE_string(key_path, "./key.pem", "key.pem path"); - std::shared_ptr HeterServer::s_instance_ = nullptr; +std::mutex HeterServer::mtx_; void HeterServer::RegisterServiceHandler(std::string message_name, HeterServiceHandler func) { @@ -130,21 +130,15 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( butil::IOBufBytesIterator io_buffer_itr(request_io_buffer); for (int idx = 0; idx < request->send_var_names_size(); idx++) { const auto& var_name = request->send_var_names(idx); - const auto& var_len = request->vars_len(idx); - auto itr = local_shard.find(var_name); - if (itr != local_shard.end()) { - LOG(INFO) << "var: " << var_name << "has not been consumed!" - << "check again"; - WaitForVarsConsumed(group_id, var_name); - } + const auto& var_size = request->vars_len(idx); + WaitForVarsConsumed(group_id, var_name); auto& value = local_shard[var_name]; - value.resize(var_len); + value.resize(var_size); io_buffer_itr.copy_and_forward(reinterpret_cast(value.data()), - var_len * sizeof(float)); - VLOG(4) << "saved data in shards: "; - for (uint32_t i = 0; i < local_shard[var_name].size(); i++) { - VLOG(4) << *(local_shard[var_name].data() + i); - } + var_size); + std::unique_lock lk(scope_mutex_); + vars_ready_flag[group_id][var_name] = 1; + VLOG(4) << "saved var_name: " << var_name << "is saved ready!"; } VLOG(4) << "SaveInSwitchWithShard success"; return 0; @@ -164,20 +158,17 @@ int SendAndRecvVariableHandler::QueryInSwitchWithShard( } auto msg_name = request->message_name(); response->set_message_name(msg_name); - for (auto& req_var_name : req_var_names) { VLOG(4) << "req var name: " << req_var_name; response->add_send_var_names(req_var_name); + WaitForVarsProduced(group_id, req_var_name); auto itr = local_shard.find(req_var_name); - if (itr == local_shard.end()) { - LOG(INFO) << "var: " << req_var_name << " not found in shards"; - WaitForVarsProduced(group_id, req_var_name); - } - LOG(INFO) << "var: " << req_var_name << " found in shards"; - itr = local_shard.find(req_var_name); auto& value = itr.value(); - response_io_buffer.append(value.data(), value.size() * sizeof(float)); - value.resize(0); // 标记位 + response_io_buffer.append(value.data(), value.size()); + value.resize(0); // 清空内存 + std::unique_lock lk(scope_mutex_); + vars_ready_flag[group_id][req_var_name] = 0; + VLOG(4) << "query var_name: " << req_var_name << "is consumed ready!"; } VLOG(4) << "heter server QueryInSwitchWithShard done"; return 0; @@ -192,37 +183,31 @@ int SendAndRecvVariableHandler::SaveInSwitchWithScope( auto& cpu_dev_ctx = *pool.Get(cpu_place); auto message_name = request->message_name(); VLOG(4) << "message_name in heter server: " << message_name; + + 
auto send_var_nums = request->send_var_names_size(); + std::vector send_var_names(send_var_nums); + for (int idx = 0; idx < send_var_nums; idx++) { + send_var_names[idx] = request->var_messages(idx).varname(); + } std::unique_lock lk(scope_mutex_); auto local_scope = local_scope_ptr.get(); if (!local_scope) { LOG(ERROR) << "local_scope_ptr is null in SaveInSwitchWithScope"; } - for (int idx = 0; idx < request->send_var_names_size(); idx++) { - const auto& msg = request->var_messages(idx); - std::string var_name = msg.varname(); + for (auto var_name : send_var_names) { auto* var_exist_ptr = local_scope->FindVar(var_name); if (!var_exist_ptr) { VLOG(4) << "not find var: " << var_name << " in local_scope"; } - vars_table[var_name] += 1; - VLOG(4) << "saved var_name: " << var_name - << ", cnt = " << vars_table[var_name]; + WaitForVarsConsumed(0, var_name); } auto& request_io_buffer = cntl->request_attachment(); distributed::DeserializeFromMultiVarMsgAndIOBuf(*request, &request_io_buffer, cpu_dev_ctx, local_scope); lk.unlock(); - while (true) { - int ret = 0; - for (int idx = 0; idx < request->send_var_names_size(); idx++) { - ret |= vars_table[request->var_messages(idx).varname()]; - } - if (!ret) { - VLOG(4) << "all saved vars consumed"; - break; - } - VLOG(4) << "waiting consume result......"; - sleep(1); + for (auto var_name : send_var_names) { + std::unique_lock lk(scope_mutex_); + vars_ready_flag[0][var_name] = 1; } VLOG(4) << "SaveInSwitchWithScope success"; return 0; @@ -258,19 +243,14 @@ int SendAndRecvVariableHandler::QueryInSwitchWithScope( // 3. fill var_messages(VarMessage) for (auto& req_var_name : req_var_names) { - LOG(INFO) << "query var_name: " << req_var_name; + WaitForVarsProduced(0, req_var_name); auto* send_var_msg = response->add_var_messages(); send_var_msg->set_varname(req_var_name); framework::Variable* var_ptr; - while (true) { - var_ptr = local_scope->FindVar(req_var_name); - if (!var_ptr) { - LOG(INFO) << "local_scope not find var: " << req_var_name; - } else { - break; - } - sleep(1); + var_ptr = local_scope->FindVar(req_var_name); + if (!var_ptr) { + LOG(INFO) << "local_scope not find var: " << req_var_name; } butil::IOBuf temp_iobuf; if (var_ptr->IsType()) { @@ -282,10 +262,7 @@ int SendAndRecvVariableHandler::QueryInSwitchWithScope( } for (auto& req_var_name : req_var_names) { std::unique_lock lk(scope_mutex_); - vars_table[req_var_name] -= 1; - VLOG(4) << "remained var: " << req_var_name - << ", cnt = " << vars_table[req_var_name]; - lk.unlock(); + vars_ready_flag[0][req_var_name] = 0; } VLOG(4) << "heter server QueryInSwitchWithScope done"; return 0; diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h old mode 100644 new mode 100755 index 624e76112c7b0..ddcf36bf68d7b --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -56,9 +56,10 @@ class Scope; DECLARE_double(eager_delete_tensor_gb); DECLARE_int32(pserver_timeout_ms); DECLARE_int32(heter_world_size); +DECLARE_int32(switch_send_recv_timeout_s); + namespace paddle { namespace distributed { - using MultiVarMsg = MultiVariableMessage; using VarMsg = VariableMessage; @@ -95,6 +96,19 @@ using SharedTaskQueue = std::shared_ptr< std::unordered_map>>>>; +class ValueInSwitch { + public: + ValueInSwitch() {} + ~ValueInSwitch() {} + char* data() { return _data.data(); } + size_t size() { return _data.size(); } + void resize(size_t size) { _data.resize(size); } + void shrink_to_fit() { 
_data.shrink_to_fit(); } + + private: + std::vector _data; +}; + class SendAndRecvVariableHandler final : public ServiceHandlerBase { public: SendAndRecvVariableHandler() { @@ -130,22 +144,41 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { brpc::Controller* cntl); void WaitForVarsConsumed(int32_t group_id, const std::string& var_name) { - auto& local_shard = _local_shards[group_id]; - while (local_shard.find(var_name) != local_shard.end()) { - if (local_shard[var_name].size() == 0) { + // timeline_.Start(); + while (true) { + { + std::lock_guard lock(scope_mutex_); + if (vars_ready_flag[group_id][var_name] == 0) { + break; + } + } + /* + timeline_.Pause(); + if (timeline_.ElapsedSec() > FLAGS_switch_send_recv_timeout_s) { + VLOG(0) << "vars not consumed exceed 10 miniutes"; break; } - VLOG(4) << "waiting consume result......"; - sleep(1); + */ } return; } void WaitForVarsProduced(int32_t group_id, const std::string& var_name) { - auto& local_shard = _local_shards[group_id]; - while (local_shard.find(var_name) == local_shard.end()) { - VLOG(4) << "waiting produce result......"; - sleep(1); + // timeline_.Start(); + while (true) { + { + std::lock_guard lock(scope_mutex_); + if (vars_ready_flag[group_id][var_name] == 1) { + break; + } + } + /* + timeline_.Pause(); + if (timeline_.ElapsedSec() > FLAGS_switch_send_recv_timeout_s) { + VLOG(0) << "vars not produced exceed 10 miniutes"; + break; + } + */ } return; } @@ -245,10 +278,12 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { } public: - using shard_type = SparseTableShard; + using shard_type = SparseTableShard; std::shared_ptr local_scope_ptr; // for switch - std::unordered_map vars_table; + std::unordered_map> + vars_ready_flag; std::unique_ptr _local_shards; + platform::Timer timeline_; private: // share with HeterPipelineTrainer @@ -354,12 +389,12 @@ class HeterService : public PsService { ::google::protobuf::Closure* done) { VLOG(4) << "entering SendToSwitch"; brpc::ClosureGuard done_guard(done); - auto& switch_client_ptr_ = + std::shared_ptr switch_client_ptr_ = HeterClient::GetSwitchInstance(peer_endpoints_, PEER_ROLE_IS_SWITCH); - if (switch_client_ptr_.peer_switch_channels_.empty()) { - LOG(ERROR) << "switch_client_ptr_.peer_switch_channels_ null"; + if (switch_client_ptr_->peer_switch_channels_.empty()) { + LOG(ERROR) << "switch_client_ptr_->peer_switch_channels_ null"; } - brpc::Channel* channel = switch_client_ptr_.peer_switch_channels_[0].get(); + brpc::Channel* channel = switch_client_ptr_->peer_switch_channels_[0].get(); brpc::Controller* cntl = static_cast(controller); // proxy: 定义新的 OnHeterRpcDone 对象(或者在类 OnHeterRpcDone 中 reset) OnHeterRpcDone* closure2 = new OnHeterRpcDone([](void* done) { @@ -389,6 +424,7 @@ class HeterService : public PsService { std_cntl.response_attachment().movable()); fut.wait(); VLOG(4) << "SendToSwitch done"; + delete closure2; } void SendS2S(::google::protobuf::RpcController* controller, @@ -421,11 +457,11 @@ class HeterService : public PsService { brpc::ClosureGuard done_guard(done); brpc::Controller* cntl = static_cast(controller); VLOG(4) << "SendToWorker(client addr) =" << cntl->remote_side(); - auto& switch_client_ptr_ = + std::shared_ptr switch_client_ptr_ = HeterClient::GetSwitchInstance(peer_endpoints_, PEER_ROLE_IS_WORKER); VLOG(4) << "in switch client, peer worker 0: " - << switch_client_ptr_.peer_worker_list_[0]; - brpc::Channel* channel = switch_client_ptr_.peer_worker_channels_[0].get(); + << switch_client_ptr_->peer_worker_list_[0]; + 
brpc::Channel* channel = switch_client_ptr_->peer_worker_channels_[0].get(); auto* closure = reinterpret_cast(done); PsService_Stub stub(channel); @@ -576,8 +612,11 @@ class HeterServer { // HeterWrapper singleton static std::shared_ptr GetInstance() { - if (NULL == s_instance_) { - s_instance_.reset(new HeterServer()); + if (s_instance_ == nullptr) { + std::unique_lock lock(mtx_); + if (NULL == s_instance_) { + s_instance_.reset(new HeterServer()); + } } return s_instance_; } @@ -587,6 +626,7 @@ class HeterServer { private: static std::shared_ptr s_instance_; mutable std::mutex mutex_; + static std::mutex mtx_; std::condition_variable cv_; std::condition_variable condition_ready_; bool stoped_ = true; diff --git a/paddle/fluid/distributed/ps/service/sendrecv.proto b/paddle/fluid/distributed/ps/service/sendrecv.proto index 46dcc2058f4b8..ae6364dd8371e 100755 --- a/paddle/fluid/distributed/ps/service/sendrecv.proto +++ b/paddle/fluid/distributed/ps/service/sendrecv.proto @@ -126,7 +126,7 @@ message MultiVariableMessage { repeated string recv_var_names = 3; repeated VariableMessage var_messages = 4; optional bytes data = 5; - repeated int32 vars_len = 6; + repeated int64 vars_len = 6; optional int32 group_id = 7; }; diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index a9cd0021c8578..a3fa80b3865e4 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -28,6 +28,22 @@ namespace paddle { namespace distributed { #ifdef PADDLE_WITH_HETERPS +int32_t GraphTable::Load_to_ssd(const std::string &path, + const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + std::string edge_type = param.substr(2); + return this->load_edges_to_ssd(path, reverse_edge, edge_type); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( int idx, std::vector ids) { std::vector> bags(task_pool_size_); @@ -38,11 +54,11 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( std::vector> tasks; std::vector edge_array[task_pool_size_]; std::vector node_array[task_pool_size_]; - for (int i = 0; i < (int)bags.size(); i++) { + for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { paddle::framework::GpuPsGraphNode x; - for (int j = 0; j < (int)bags[i].size(); j++) { + for (size_t j = 0; j < bags[i].size(); j++) { Node *v = find_node(0, idx, bags[i][j]); x.node_id = bags[i][j]; if (v == NULL) { @@ -53,7 +69,7 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( x.neighbor_size = v->get_neighbor_size(); x.neighbor_offset = edge_array[i].size(); node_array[i].push_back(x); - for (int k = 0; k < x.neighbor_size; k++) { + for (size_t k = 0; k < x.neighbor_size; k++) { edge_array[i].push_back(v->get_neighbor_id(k)); } } @@ -64,27 +80,29 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( } for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); paddle::framework::GpuPsCommGraph res; - int tot_len = 0; + unsigned int tot_len = 0; for (int i = 0; i < task_pool_size_; i++) { - tot_len += (int)edge_array[i].size(); - } - res.neighbor_size = tot_len; - res.node_size = ids.size(); - res.neighbor_list = new 
int64_t[tot_len]; - res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; - int offset = 0, ind = 0; + tot_len += edge_array[i].size(); + } + // res.neighbor_size = tot_len; + // res.node_size = ids.size(); + // res.neighbor_list = new int64_t[tot_len]; + // res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; + res.init_on_cpu(tot_len, (unsigned int)ids.size()); + unsigned int offset = 0, ind = 0; for (int i = 0; i < task_pool_size_; i++) { for (int j = 0; j < (int)node_array[i].size(); j++) { res.node_list[ind] = node_array[i][j]; res.node_list[ind++].neighbor_offset += offset; } - for (int j = 0; j < (int)edge_array[i].size(); j++) { + for (size_t j = 0; j < edge_array[i].size(); j++) { res.neighbor_list[offset + j] = edge_array[i][j]; } offset += edge_array[i].size(); } return res; } + int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, char *data, int len) { if (_db != NULL) { @@ -92,8 +110,31 @@ int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, memcpy(ch, &type_id, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t)); - _db->put(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); + std::string str; + if (_db->get(src_id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), str) == 0) { + int64_t *stored_data = ((int64_t *)str.c_str()); + int n = str.size() / sizeof(int64_t); + char *new_data = new char[n * sizeof(int64_t) + len]; + memcpy(new_data, stored_data, n * sizeof(int64_t)); + memcpy(new_data + n * sizeof(int64_t), data, len); + _db->put(src_id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), (char *)new_data, + n * sizeof(int64_t) + len); + delete[] new_data; + } else { + _db->put(src_id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); + } + _db->flush(src_id % shard_num % task_pool_size_); + std::string x; + // if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) + + // 2 * sizeof(int), x) ==0){ + // VLOG(0)<<"put result"; + // for(int i = 0;i < x.size();i+=8){ + // VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i)); + // } + //} } return 0; } @@ -109,8 +150,8 @@ char *GraphTable::random_sample_neighbor_from_ssd( memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int) * 2, &id, sizeof(int64_t)); - if (_db->get(id % shard_num % task_pool_size_, ch, sizeof(uint64_t), str) == - 0) { + if (_db->get(id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), str) == 0) { int64_t *data = ((int64_t *)str.c_str()); int n = str.size() / sizeof(int64_t); std::unordered_map m; @@ -142,7 +183,298 @@ char *GraphTable::random_sample_neighbor_from_ssd( actual_size = 0; return NULL; } + +int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, + std::vector &ids) { + std::vector> bags(task_pool_size_); + for (auto x : ids) { + int location = x % shard_num % task_pool_size_; + bags[location].push_back(x); + } + std::vector> tasks; + std::vector count(task_pool_size_, 0); + for (size_t i = 0; i < bags.size(); i++) { + if (bags[i].size() > 0) { + tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { + + char ch[sizeof(int) * 2 + sizeof(int64_t)]; + memset(ch, 0, sizeof(int)); + memcpy(ch + sizeof(int), &idx, sizeof(int)); + for (size_t k = 0; k < bags[i].size(); k++) { + auto v = bags[i][k]; + memcpy(ch + sizeof(int) * 2, &v, 
sizeof(int64_t)); + std::string str; + if (_db->get(i, ch, sizeof(int) * 2 + sizeof(int64_t), str) == 0) { + count[i] += (int64_t)str.size(); + for (int j = 0; j < str.size(); j += sizeof(int64_t)) { + int64_t id = *(int64_t *)(str.c_str() + j); + add_comm_edge(idx, v, id); + } + } + } + return 0; + })); + } + } + + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + int64_t tot = 0; + for (auto x : count) tot += x; + return tot; +} + +void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { + VLOG(2) << "start to make graph partitions , byte_size = " << byte_size + << " total memory cost = " << total_memory_cost; + if (total_memory_cost == 0) { + VLOG(0) << "no edges are detected,make partitions exits"; + return; + } + const float a = 2.0, y = 1.25; + int64_t gb_size_by_discount = byte_size * 0.8 * device_len; + if (gb_size_by_discount <= 0) gb_size_by_discount = 1; + int part_len = total_memory_cost / gb_size_by_discount; + if (part_len == 0) part_len = 1; + + VLOG(2) << "part_len = " << part_len + << " byte size = " << gb_size_by_discount; + partitions[idx].clear(); + partitions[idx].resize(part_len); + std::vector memory_remaining(part_len, gb_size_by_discount); + std::vector score(part_len, 0); + std::unordered_map id_map; + std::vector iters; + for (int i = 0; i < task_pool_size_; i++) { + iters.push_back(_db->get_iterator(i)); + iters[i]->SeekToFirst(); + } + int next = 0; + while (iters.size()) { + if (next >= iters.size()) { + next = 0; + } + if (!iters[next]->Valid()) { + iters.erase(iters.begin() + next); + continue; + } + std::string key = iters[next]->key().ToString(); + int temp_idx = *(int *)(key.c_str() + sizeof(int)); + if (temp_idx != idx) { + iters[next]->Next(); + next++; + continue; + } + std::string value = iters[next]->value().ToString(); + std::int64_t i_key = *(int64_t *)(key.c_str() + 8); + for (int i = 0; i < part_len; i++) { + if (memory_remaining[i] < (int64_t)value.size()) { + score[i] = -100000.0; + } else { + score[i] = 0; + } + } + for (int j = 0; j < value.size(); j += sizeof(int64_t)) { + int64_t v = *((int64_t *)(value.c_str() + j)); + int index = -1; + if (id_map.find(v) != id_map.end()) { + index = id_map[v]; + score[index]++; + } + } + float base; + int index = 0; + for (int i = 0; i < part_len; i++) { + base = gb_size_by_discount - memory_remaining[i]; + score[i] -= a * y * std::pow(1.0 * base, y - 1); + if (score[i] > score[index]) index = i; + VLOG(2) << "score" << i << " = " << score[i] << " memory left " + << memory_remaining[i]; + } + id_map[i_key] = index; + partitions[idx][index].push_back(i_key); + memory_remaining[index] -= (int64_t)value.size(); + iters[next]->Next(); + next++; + } + for (int i = 0; i < part_len; i++) { + if (partitions[idx][i].size() == 0) { + partitions[idx].erase(partitions[idx].begin() + i); + i--; + part_len--; + continue; + } + VLOG(2) << " partition " << i << " size = " << partitions[idx][i].size(); + for (auto x : partitions[idx][i]) { + VLOG(2) << "find a id " << x; + } + } + next_partition = 0; +} + +void GraphTable::clear_graph(int idx) { + for (auto p : edge_shards[idx]) { + delete p; + } + + edge_shards[idx].clear(); + for (size_t i = 0; i < shard_num_per_server; i++) { + edge_shards[idx].push_back(new GraphShard()); + } +} +int32_t GraphTable::load_next_partition(int idx) { + if (next_partition >= partitions[idx].size()) { + VLOG(0) << "partition iteration is done"; + return -1; + } + clear_graph(idx); + load_graph_to_memory_from_ssd(idx, partitions[idx][next_partition]); + 
next_partition++; + return 0; +} +int32_t GraphTable::load_edges_to_ssd(const std::string &path, + bool reverse_edge, + const std::string &edge_type) { + int idx = 0; + if (edge_type == "") { + VLOG(0) << "edge_type not specified, loading edges to " << id_to_edge[0] + << " part"; + } else { + if (edge_to_id.find(edge_type) == edge_to_id.end()) { + VLOG(0) << "edge_type " << edge_type + << " is not defined, nothing will be loaded"; + return 0; + } + idx = edge_to_id[edge_type]; + } + total_memory_cost = 0; + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + VLOG(0) << "get a line from file " << line; + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoll(values[0]); + auto dist_ids = paddle::string::split_string(values[1], ";"); + std::vector dist_data; + for (auto x : dist_ids) { + dist_data.push_back(std::stoll(x)); + total_memory_cost += sizeof(int64_t); + } + add_node_to_ssd(0, idx, src_id, (char *)dist_data.data(), + (int)(dist_data.size() * sizeof(int64_t))); + } + } + VLOG(0) << "total memory cost = " << total_memory_cost << " bytes"; + return 0; +} + +int32_t GraphTable::dump_edges_to_ssd(int idx) { + VLOG(0) << "calling dump edges to ssd"; + const int64_t fixed_size = 10000; + // std::vector edge_array[task_pool_size_]; + std::vector> count(task_pool_size_); + std::vector> tasks; + auto &shards = edge_shards[idx]; + for (size_t i = 0; i < shards.size(); ++i) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&, i, this]() -> int64_t { + int64_t cost = 0; + std::vector &v = shards[i]->get_bucket(); + std::vector s; + size_t ind = i % this->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + s.push_back(v[j]->get_neighbor_id(k)); + } + cost += v[j]->get_neighbor_size() * sizeof(int64_t); + add_node_to_ssd(0, idx, v[j]->get_id(), (char *)s.data(), + s.size() * sizeof(int64_t)); + } + return cost; + })); + } + for (size_t i = 0; i < tasks.size(); i++) total_memory_cost += tasks[i].get(); + return 0; +} +int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { + VLOG(0) << "make_complementary_graph"; + const int64_t fixed_size = 10000; + // std::vector edge_array[task_pool_size_]; + std::vector> count(task_pool_size_); + std::vector> tasks; + auto &shards = edge_shards[idx]; + for (size_t i = 0; i < shards.size(); ++i) { + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + std::vector &v = shards[i]->get_bucket(); + size_t ind = i % this->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id(); + for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + count[ind][v[j]->get_neighbor_id(k)]++; + } + } + return 0; + })); + } + + std::unordered_map final_count; + std::map> count_to_id; + std::vector buffer; + for (auto p : edge_shards[idx]) { + delete p; + } + + edge_shards[idx].clear(); + for (size_t i = 0; i < shard_num_per_server; i++) { + edge_shards[idx].push_back(new GraphShard()); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (int i = 0; i < task_pool_size_; i++) { + for (auto &p : count[i]) { + final_count[p.first] = final_count[p.first] + p.second; + } + count[i].clear(); + } + 
for (auto &p : final_count) { + count_to_id[p.second].push_back(p.first); + VLOG(2) << p.first << " appear " << p.second << " times"; + } + // std::map>::iterator iter= count_to_id.rbegin(); + auto iter = count_to_id.rbegin(); + while (iter != count_to_id.rend() && byte_size > 0) { + for (auto x : iter->second) { + buffer.push_back(x); + if (buffer.size() >= fixed_size) { + int64_t res = load_graph_to_memory_from_ssd(idx, buffer); + byte_size -= res; + } + if (byte_size <= 0) break; + } + iter++; + } + if (byte_size > 0 && buffer.size() > 0) { + int64_t res = load_graph_to_memory_from_ssd(idx, buffer); + byte_size -= res; + } + std::string sample_type = "random"; + for (auto &shard : edge_shards[idx]) { + auto bucket = shard->get_bucket(); + for (size_t i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + return 0; +} #endif + /* int CompleteGraphSampler::run_graph_sampling() { pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); @@ -700,9 +1032,11 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, const std::string &edge_type) { - // #ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); - // #endif +#ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + if (search_level == 2) total_memory_cost = 0; + const int64_t fixed_load_edges = 1000000; +#endif int idx = 0; if (edge_type == "") { VLOG(0) << "edge_type not specified, loading edges to " << id_to_edge[0] @@ -715,6 +1049,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, } idx = edge_to_id[edge_type]; } + auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -756,13 +1091,33 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); valid_count++; +#ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + if (count > fixed_load_edges && search_level == 2) { + dump_edges_to_ssd(idx); + VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; + clear_graph(idx); + count = 0; + } +#endif } } VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " << path; - // Build Sampler j - +// Build Sampler j +#ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + if (search_level == 2) { + if (count > 0) { + dump_edges_to_ssd(idx); + VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; + clear_graph(idx); + count = 0; + } + return 0; + } +#endif for (auto &shard : edge_shards[idx]) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -892,7 +1247,6 @@ int32_t GraphTable::random_sample_neighbors( scaled_lru->query(i, id_list[i].data(), id_list[i].size(), r); } int index = 0; - uint32_t idx; std::vector sample_res; std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; @@ -911,6 +1265,7 @@ int32_t GraphTable::random_sample_neighbors( if (node == nullptr) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) { + VLOG(2) << "enter sample from ssd"; char *buffer_addr = random_sample_neighbor_from_ssd( idx, node_id, sample_size, rng, actual_size); if (actual_size != 0) { @@ -1060,6 +1415,26 @@ std::pair GraphTable::parse_feature( return std::make_pair(-1, ""); } +std::vector> 
GraphTable::get_all_id(int type_id, int idx, + int slice_num) { + std::vector> res(slice_num); + auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; + std::vector>> tasks; + for (int i = 0; i < search_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&search_shards, i]() -> std::vector { + return search_shards[i]->get_all_id(); + })); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + for (size_t i = 0; i < tasks.size(); i++) { + auto ids = tasks[i].get(); + for (auto &id : ids) res[id % slice_num].push_back(id); + } + return res; +} int32_t GraphTable::pull_graph_list(int type_id, int idx, int start, int total_size, std::unique_ptr &buffer, @@ -1218,6 +1593,9 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; edge_shards.resize(id_to_edge.size()); +#ifdef PADDLE_WITH_HETERPS + partitions.resize(id_to_edge.size()); +#endif for (int k = 0; k < (int)edge_shards.size(); k++) { for (size_t i = 0; i < shard_num_per_server; i++) { edge_shards[k].push_back(new GraphShard()); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 059bcb09a0a6e..2d869dc805a94 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -63,7 +63,13 @@ class GraphShard { } return res; } - + std::vector get_all_id() { + std::vector res; + for (int i = 0; i < (int)bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } GraphNode *add_graph_node(int64_t id); GraphNode *add_graph_node(Node *node); FeatureNode *add_feature_node(int64_t id); @@ -420,6 +426,10 @@ class GraphTable : public Table { use_cache = false; shard_num = 0; rw_lock.reset(new pthread_rwlock_t()); +#ifdef PADDLE_WITH_HETERPS + next_partition = 0; + total_memory_cost = 0; +#endif } virtual ~GraphTable(); @@ -465,6 +475,8 @@ class GraphTable : public Table { int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); + std::vector> get_all_id(int type, int idx, + int slice_num); int32_t load_nodes(const std::string &path, std::string node_type); int32_t add_graph_node(int idx, std::vector &id_list, @@ -513,7 +525,7 @@ class GraphTable : public Table { const std::vector> &res); size_t get_server_num() { return server_num; } - + void clear_graph(int idx); virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) { { std::unique_lock lock(mutex_); @@ -538,6 +550,7 @@ class GraphTable : public Table { // graph_sampler->set_graph_sample_callback(callback); // return 0; // } + virtual void make_partitions(int idx, int64_t gb_size, int device_len); virtual char *random_sample_neighbor_from_ssd( int idx, int64_t id, int sample_size, const std::shared_ptr rng, int &actual_size); @@ -545,8 +558,25 @@ class GraphTable : public Table { char *data, int len); virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( int idx, std::vector ids); + int32_t Load_to_ssd(const std::string &path, const std::string ¶m); + int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); + int32_t make_complementary_graph(int idx, int64_t byte_size); + int32_t dump_edges_to_ssd(int idx); + int32_t get_partition_num(int idx) { return partitions[idx].size(); } + std::vector get_partition(int idx, int index) { + if (idx >= 
partitions.size() || index >= partitions[idx].size()) + return std::vector(); + return partitions[idx][index]; + } + int32_t load_edges_to_ssd(const std::string &path, bool reverse_edge, + const std::string &edge_type); + int32_t load_next_partition(int idx); + void set_search_level(int search_level) { this->search_level = search_level; } // virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } int search_level; + int64_t total_memory_cost; + std::vector>> partitions; + int next_partition; #endif virtual int32_t add_comm_edge(int idx, int64_t src_id, int64_t dst_id); virtual int32_t build_sampler(int idx, std::string sample_type = "random"); diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index ec86239ffb161..6516c75a5d696 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -94,9 +94,9 @@ class MemorySparseTable : public Table { protected: const int _task_pool_size = 24; - size_t _avg_local_shard_num; - size_t _real_local_shard_num; - size_t _sparse_table_shard_num; + int _avg_local_shard_num; + int _real_local_shard_num; + int _sparse_table_shard_num; std::vector> _shards_task_pool; std::unique_ptr _local_shards; }; diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 10696dbacd35b..08e8f2baef6a0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" -DECLARE_bool(retain_grad_for_all_tensor); + namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -38,10 +38,13 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } } -std::vector> GradNodeAccumulation:: -operator()( - std::vector>& grads, // NOLINT - bool create_graph) { +paddle::small_vector, + kSlotSmallVectorSize> +GradNodeAccumulation::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph, + bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -56,14 +59,15 @@ operator()( // Apply Gradient Hooks paddle::experimental::Tensor grad_out; if (GradientHooksRegistered()) { - std::vector> hooked_grads = - ApplyGradientHooks(grads); + paddle::small_vector, + kSlotSmallVectorSize> + hooked_grads = ApplyGradientHooks(grads); grad_out = hooked_grads[0][0]; } else { grad_out = grads[0][0]; } - if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { + if (!weak_grad_.expired() && !is_new_grad) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 38d5533c3d606..f37de9c8e88f1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -37,9 +37,12 @@ class GradNodeAccumulation : public GradNodeBase { } // Functor: perform backward computations - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false) override; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) 
override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h index 95313bde02a20..2145f4a11965c 100644 --- a/paddle/fluid/eager/amp_utils.h +++ b/paddle/fluid/eager/amp_utils.h @@ -21,8 +21,8 @@ namespace egr { static inline paddle::experimental::DataType GetPromoteType( const std::string& op_name, - const std::vector>& - amp_tensors_vector, + const paddle::small_vector, + kSlotSmallVectorSize>& amp_tensors_vector, const paddle::experimental::DataType& amp_dtype) { auto dst_type = amp_dtype; if (egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype() == @@ -86,8 +86,8 @@ static inline paddle::experimental::DataType GetPromoteType( inline paddle::experimental::DataType GetAmpDestDtype( const std::string& op_name, - const std::vector>& - amp_tensors_vector) { + const paddle::small_vector, + kSlotSmallVectorSize>& amp_tensors_vector) { auto amp_dtype = egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype(); auto amp_level = egr::Controller::Instance().GetAMPLevel(); diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index d9f5447a88e9b..8bd40140f53cc 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -144,11 +144,15 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } -std::vector> GradNodeScale:: -operator()( - std::vector>& grads, // NOLINT - bool create_graph) { +paddle::small_vector, + kSlotSmallVectorSize> +GradNodeScale::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph, + bool is_new_grad) { // 1. Check Output Size + VLOG(6) << "grad size is: " << grads.size(); PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), paddle::platform::errors::Fatal( @@ -156,15 +160,18 @@ operator()( "However received: %d", "This indicates an issue with Eager Dygraph Backward logic", grads.size())); - std::vector> outs; + paddle::small_vector, + kSlotSmallVectorSize> + outs; // 2. Create needed out parttern paddle::experimental::Tensor out; // Apply Gradient Hooks if (GradientHooksRegistered()) { // TODO(jiabin): Shall we apply hook slot by slot here or accept // vector> to apply all hooks? 
- std::vector> hooked_grads = - ApplyGradientHooks(grads); + paddle::small_vector, + kSlotSmallVectorSize> + hooked_grads = ApplyGradientHooks(grads); ScaleAPI(/* slot by slot set */ hooked_grads[0][0], scale_, 0.0 /* bias */, true /* bias_after_scale */, &out); } else { diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index dd61ddc486eef..04ff510944dd2 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -38,9 +38,12 @@ class GradNodeScale : public GradNodeBase { ~GradNodeScale() override = default; // Functor: perform backward computations - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false) override; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } @@ -48,7 +51,7 @@ class GradNodeScale : public GradNodeBase { const std::vector& tensors); void SetAttributes_scale(float scale); - std::string name() override { return ""; } + std::string name() override { return "scale node"; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 1be3b31de00a6..7a374d567d5d0 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -79,9 +79,6 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, // Pass Attributes to GradNode scale_node->SetAttributes_scale(scale); - // Set Next Edges - scale_node->AddEdges(p_autograd_in, /*slot id*/ 0); - // Set TensorWrappers scale_node->SetTensorWrappers_X({x}); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index adfcab961bfe5..44e78c3bbf193 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -19,8 +19,9 @@ #include #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/small_vector.h" namespace egr { - +constexpr size_t kSlotSmallVectorSize = 15U; class UniqueNameGenerator { public: explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {} diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 307f8fae31597..44fa8461f2fe9 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,6 +56,13 @@ static std::unordered_set black_ops_list = {"run_program"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '-' to '_' + return ret; +} + +static std::string LegalizeVarName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '-' to '_' return ret; } @@ -1024,7 +1031,8 @@ static 
std::string GenerateGradNodeCreationContent( // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; + const std::string& output_autograd_name = + "p_autograd_" + LegalizeVarName(output_name); // output autograd_meta should be got after running TraceOP. if (output.duplicable()) { @@ -1032,12 +1040,13 @@ static std::string GenerateGradNodeCreationContent( " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_output_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, + LegalizeVarName(output_name)); } else { // In inplace op, the case where output is duplicable is not considered. // Replace output directly with input in inplace op. if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = LegalizeVarName(inplace_map[output_name]); const std::string& inplace_input_autograd_name = "p_autograd_" + inplace_input_name; const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = @@ -1049,9 +1058,9 @@ static std::string GenerateGradNodeCreationContent( const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; - get_output_autograd_meta_str += - paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, - output_autograd_name, output_name); + get_output_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, + LegalizeVarName(output_name)); } } } @@ -1061,28 +1070,32 @@ static std::string GenerateGradNodeCreationContent( // inplace). 
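
These generator hunks funnel every forward input and output name through LegalizeVarName before it is spliced into generated C++, so characters such as '@' (legal in framework variable names but not in C++ identifiers) cannot produce uncompilable code. A standalone sketch of the helper and its effect; the sample name is only an example.

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Same shape as the helper added in this diff; shown standalone for illustration.
    static std::string LegalizeVarName(const std::string& var_name) {
      std::string ret = var_name;
      std::replace(ret.begin(), ret.end(), '@', '_');  // replace every '@' with '_'
      return ret;
    }

    int main() {
      // An "@"-suffixed name is a valid framework variable name but not a valid
      // C++ identifier, so generated code uses the legalized form instead.
      std::cout << LegalizeVarName("Out@GRAD") << "\n";  // prints Out_GRAD
      return 0;
    }
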
for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); - const std::string& input_autograd_name = "p_autograd_" + input_name; + const std::string& input_autograd_name = + "p_autograd_" + LegalizeVarName(input_name); if (input.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_input_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); + GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, + LegalizeVarName(input_name)); } else if (input.dispensable()) { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_input_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); + GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, + LegalizeVarName(input_name)); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_input_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); + GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, + LegalizeVarName(input_name)); } } VLOG(6) << "Generated inputs autograd_meta"; @@ -1096,7 +1109,7 @@ static std::string GenerateGradNodeCreationContent( " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " "require_any_grad);\n"; for (auto& inplace_pair : inplace_map) { - std::string inplace_name = inplace_pair.second; + std::string inplace_name = LegalizeVarName(inplace_pair.second); check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name); } @@ -1159,12 +1172,12 @@ static std::string GenerateGradNodeCreationContent( if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { auto inplace_input_name = inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - inplace_input_name, full_reserved); + SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), + LegalizeVarName(inplace_input_name), full_reserved); } else { grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_name, full_reserved); + SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), + LegalizeVarName(tensor_wrapper_name), full_reserved); } } } @@ -1176,7 +1189,8 @@ static std::string GenerateGradNodeCreationContent( std::string compute_require_grad_args = "trace_backward"; for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); - const std::string& input_autograd_name = "p_autograd_" + input_name; + const std::string& input_autograd_name = + "p_autograd_" + LegalizeVarName(input_name); if (!input.duplicable()) { compute_require_grad_args += ", " + input_autograd_name; @@ -1184,26 +1198,19 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_OUT_META_TEMPLATE = " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - - const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += - paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, - input_autograd_name, input_position); + 
paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, + LegalizeVarName(input_name), input_position); + } else { compute_require_grad_args += ", &" + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - ADD_EDGES_TEMPLATE, input_autograd_name, input_position); + grad_node_creation_str += + paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, + LegalizeVarName(input_name), input_position); } } @@ -1217,7 +1224,7 @@ static std::string GenerateGradNodeCreationContent( if (!inplace_map.empty() && inplace_map.count(output_name)) { auto inplace_input_name = inplace_map[output_name]; const std::string& inplace_input_autograd_name = - "p_autograd_" + inplace_input_name; + "p_autograd_" + LegalizeVarName(inplace_input_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); // Intermediate Tensor does not require SetHistory, nor RetainGrad @@ -1237,18 +1244,20 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(inplace_input_name), + output_position); // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + grad_node_creation_str += paddle::string::Sprintf( + RETAIN_GRAD_TEMPLATE, LegalizeVarName(inplace_input_name)); } } else { - const std::string& output_autograd_name = "p_autograd_" + output_name; + const std::string& output_autograd_name = + "p_autograd_" + LegalizeVarName(output_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); // Intermediate Tensor does not require SetHistory, nor RetainGrad @@ -1270,7 +1279,8 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(output_name), + output_position); } else { pass_stop_gradient_args += ", " + output_autograd_name; @@ -1289,7 +1299,8 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(output_name), + output_position); } // Intermediate Tensor does not require CheckAndRetainGrad @@ -1297,8 +1308,8 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + grad_node_creation_str += paddle::string::Sprintf( + RETAIN_GRAD_TEMPLATE, LegalizeVarName(output_name)); } } } @@ -1421,9 
+1432,10 @@ static std::pair GenerateForwardFunctionContents( if (input.duplicable()) { const char* FWD_INS_ARG_TEMPLATE = "const std::vector& %s"; - input_args_str_list[input_position] = - paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); - amp_function_call_args_str_list[input_position] = " NEW_" + input_name; + input_args_str_list[input_position] = paddle::string::Sprintf( + FWD_INS_ARG_TEMPLATE, LegalizeVarName(input_name)); + amp_function_call_args_str_list[input_position] = + " NEW_" + LegalizeVarName(input_name); core_ops_args_type_info[op_type][input_position] = "list"; } else { @@ -1442,9 +1454,10 @@ static std::pair GenerateForwardFunctionContents( if (!flag_find_input_name) { FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; } - input_args_str_list[input_position] = - paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); - amp_function_call_args_str_list[input_position] = " NEW_" + input_name; + input_args_str_list[input_position] = paddle::string::Sprintf( + FWD_INS_ARG_TEMPLATE, LegalizeVarName(input_name)); + amp_function_call_args_str_list[input_position] = + " NEW_" + LegalizeVarName(input_name); core_ops_args_type_info[op_type][input_position] = "tensor"; } @@ -1454,8 +1467,8 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_INS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE, - input_name, input_name); + ins_contents_str += paddle::string::Sprintf( + FWD_INS_CONTENT_TEMPLATE, input_name, LegalizeVarName(input_name)); if (input.duplicable()) { const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,"; amp_tensors_vector_str += @@ -1464,16 +1477,18 @@ static std::pair GenerateForwardFunctionContents( " auto NEW_%s = egr::AmpAutoCasts(\"%s\", %s, amp_dst_dtype, " "\"%s\");\n"; amp_auto_cast_str += paddle::string::Sprintf( - AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type); + AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type); } else { const char* AMP_TENSORS_VECTOR_TEMPLATE = "{%s},"; - amp_tensors_vector_str += - paddle::string::Sprintf(AMP_TENSORS_VECTOR_TEMPLATE, input_name); + amp_tensors_vector_str += paddle::string::Sprintf( + AMP_TENSORS_VECTOR_TEMPLATE, LegalizeVarName(input_name)); const char* AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = egr::AmpAutoCast(\"%s\", %s, amp_dst_dtype, " "\"%s\");\n"; amp_auto_cast_str += paddle::string::Sprintf( - AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type); + AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type); } } if (ins_contents_str.size() > 0) @@ -1509,35 +1524,41 @@ static std::pair GenerateForwardFunctionContents( " if(%s.size() > 0) " "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; dispensable_ins_contents_str += paddle::string::Sprintf( - FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); + FWD_INS_CONTENT_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name)); const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE = " if(%s.size() > 0) " "amp_tensors_vector.push_back(%s);\n"; dispensable_amp_tensors_vector_str += paddle::string::Sprintf( - FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name); + FWD_AMP_TENSORS_VECTOR_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name)); const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = ((%s.size() > 0) ? 
egr::AmpAutoCasts(\"%s\", " "%s, amp_dst_dtype, \"%s\") : %s);\n"; dispensable_amp_auto_cast_str += paddle::string::Sprintf( - DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, - input_name, input_name, op_type, input_name); + DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type, LegalizeVarName(input_name)); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; dispensable_ins_contents_str += paddle::string::Sprintf( - FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); + FWD_INS_CONTENT_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name)); const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE = " if(%s.initialized()) " "amp_tensors_vector.push_back({ %s });\n"; dispensable_amp_tensors_vector_str += paddle::string::Sprintf( - FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name); + FWD_AMP_TENSORS_VECTOR_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name)); const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = ((%s.initialized()) ? egr::AmpAutoCast(\"%s\", " "%s, amp_dst_dtype, \"%s\") : %s);\n"; dispensable_amp_auto_cast_str += paddle::string::Sprintf( - DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, - input_name, input_name, op_type, input_name); + DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type, LegalizeVarName(input_name)); } } } @@ -1559,18 +1580,18 @@ static std::pair GenerateForwardFunctionContents( if (output.duplicable()) { const char* FWD_NUM_ARG_TEMPLATE = ", std::vector& %s"; - std::string arg_str = - paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); + std::string arg_str = paddle::string::Sprintf( + FWD_NUM_ARG_TEMPLATE, LegalizeVarName(output_var_name)); dygraph_function_args_str += arg_str; - amp_function_call_args_str += (", " + output_var_name); + amp_function_call_args_str += (", " + LegalizeVarName(output_var_name)); core_ops_args_type_info[op_type].push_back("list"); } else { const char* FWD_NUM_ARG_TEMPLATE = ", paddle::experimental::Tensor* %s"; - std::string arg_str = - paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); + std::string arg_str = paddle::string::Sprintf( + FWD_NUM_ARG_TEMPLATE, LegalizeVarName(output_var_name)); dygraph_function_args_str += arg_str; - amp_function_call_args_str += (", " + output_var_name); + amp_function_call_args_str += (", " + LegalizeVarName(output_var_name)); core_ops_args_type_info[op_type].push_back("tensor"); } @@ -1586,8 +1607,9 @@ static std::pair GenerateForwardFunctionContents( } else { const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - outs_contents_str += paddle::string::Sprintf( - FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + outs_contents_str += + paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name, + LegalizeVarName(output_var_name)); } core_ops_args_info[op_type].push_back(output_name); @@ -1649,7 +1671,8 @@ static std::pair GenerateForwardFunctionContents( std::string amp_logic_str = ""; if (in_vars.size() != 0) { const char* AMP_TENSORS_VECTOR_TEMPLATE = - " std::vector> " + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> " "amp_tensors_vector = { " "%s };\n"; std::string amp_tensors_vector = paddle::string::Sprintf( @@ -1781,7 +1804,8 @@ static std::pair 
GenerateForwardFunctionContents( std::vector return_types(output_size); for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string output_var_args_name = output_name + "Var"; + const std::string output_var_args_name = + LegalizeVariableName(output_name + "Var"); std::string out_tensor_str; size_t return_position = fwd_outputs_name_pos_map.at(output_name); std::string output_varname = LegalizeVariableName(output_name); @@ -1845,9 +1869,11 @@ static std::pair GenerateForwardFunctionContents( " %s.bump_inplace_version();\n" " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " "Strategy.\";\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, - inplace_input_name, inplace_input_name); + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_name, + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name)); } else { const char* FWD_OUT_TENSOR_TEMPLATE = " paddle::experimental::Tensor %s;\n" @@ -1862,7 +1888,8 @@ static std::pair GenerateForwardFunctionContents( if (!inplace_map.empty() && inplace_map.count(output_name)) { // Replace output directly with input in inplace op. - return_contents[return_position] = inplace_map[output_name]; + return_contents[return_position] = + LegalizeVarName(inplace_map[output_name]); } else { return_contents[return_position] = output_varname; } @@ -2428,9 +2455,11 @@ static std::string GenerateGradNodeCCContents( } const char* BWD_RETURN_TEMPLATE = - " std::vector> hooked_grads = " + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> hooked_grads = " "GradNode%s::ApplyGradientHooks(grads);\n" - " std::vector> outputs(%d);\n" + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> outputs(%d);\n" " %s\n" " if(NeedComplexToRealConversion()) " "HandleComplexGradToRealGrad(&outputs);\n" @@ -2441,10 +2470,12 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = - "std::vector> " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize> " "GradNode%s::operator()(" - "std::vector>& grads, bool " - "create_graph) {\n" + "paddle::small_vector, " + "egr::kSlotSmallVectorSize>& grads, bool " + "create_graph, bool is_new_grad) {\n" "%s" "%s" "\n}"; @@ -2487,10 +2518,13 @@ static std::string GenerateGradNodeHeaderContents( "Construct GradNode%s \"; }\n" " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" - " virtual std::vector> " + " virtual " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize> " "operator()(" - "std::vector>& grads, bool " - "create_graph = false) " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize>& grads, bool " + "create_graph = false, bool is_new_grad = false) " "override;\n" "\n" " void ClearTensorWrappers() override { \n" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 7769c5371baba..8c98d9fa275dc 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -22,17 +22,12 @@ ### Global Variables ### ######################## ops_to_fill_zero_for_empty_grads = set([ - "split_grad", - "rnn_grad", - "matmul_double_grad", - "matmul_triple_grad", - "sigmoid_double_grad", - "sigmoid_triple_grad", - 
"add_double_grad", - "add_triple_grad", - "multiply_double_grad", - "multiply_triple_grad", - "conv2d_grad_grad", + "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", + "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", + "add_triple_grad", "multiply_double_grad", "multiply_triple_grad", + "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad", + "tanh_triple_grad", "subtract_double_grad", "divide_double_grad", + "log_double_grad", "elu_double_grad", "leaky_relu_double_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 54c6e39283ec5..00b9aa7a231a3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -118,8 +118,8 @@ class {} : public egr::GradNodeBase {{ egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}} ~{}() override = default; - virtual std::vector> operator()( - std::vector>& grads, bool create_graph = false) override; + virtual paddle::small_vector, egr::kSlotSmallVectorSize> operator()( + paddle::small_vector, egr::kSlotSmallVectorSize>& grads, bool create_graph = false, bool is_new_grad = false) override; std::string name() override {{ return \"{}\"; }} void ClearTensorWrappers() override {{ @@ -149,7 +149,7 @@ class {} : public egr::GradNodeBase {{ GRAD_FUNCTION_TEMPLATE = \ """ -std::vector> {}::operator()(std::vector>& grads, bool create_graph) {{ +paddle::small_vector, egr::kSlotSmallVectorSize> {}::operator()(paddle::small_vector, egr::kSlotSmallVectorSize>& grads, bool create_graph, bool is_new_grad) {{ // Fill Zero For GradIn Tensors {} @@ -239,7 +239,6 @@ class {} : public egr::GradNodeBase {{ // Set TensorWrappers for Forward Inputs {} // SetGradOutMeta & SetEdges -{} {} // SetOutRank & SetHistory & SetGradInMeta & RetainGrad {} @@ -356,7 +355,7 @@ class {} : public egr::GradNodeBase {{ if (egr::Controller::Instance().GetAMPLevel() != paddle::imperative::AmpLevel::O0) {{ VLOG(5) << "Check and Prepare For AMP"; {} - std::vector> amp_tensors_vector = {}; + paddle::small_vector, egr::kSlotSmallVectorSize> amp_tensors_vector = {}; {} {} {} @@ -769,15 +768,11 @@ def GenerateNodeCreationCodes(self): is_optional = (name in self.optional_inputs) if is_optional: set_grad_out_meta = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" - set_edges = f"{indent}if({name}.get_ptr() != nullptr) grad_node->AddEdges({input_autograd_meta_name}, {pos});" else: set_grad_out_meta = f"{indent}grad_node->SetGradOutMeta({name}, {pos});" - set_edges = f"{indent}grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) - set_edges_list.append(set_edges) set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) - set_edges_str = "\n".join(set_edges_list) # SetOutRank & SetHistory & SetGradInMeta set_out_rank_list = [] @@ -808,7 +803,7 @@ def GenerateNodeCreationCodes(self): self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, node_construction_str, set_attributes_str, - set_input_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str, + set_input_tensor_wrappers_str, set_grad_out_meta_str, set_out_rank_str, set_history_str, set_grad_in_meta_str, set_retain_grad_str, set_output_tensor_wrappers_str) @@ -1454,7 +1449,7 @@ def 
GenerateNodeDefinition(self, grad_node_creation_str): # Construct grad_api returns slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) - returns_str = f"{indent}std::vector> returns({slot_num_bwd_outputs});\n" + returns_str = f"{indent}paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs});\n" for name, (ttype, fwd_position, grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index a1df822265309..7a4e7f81611d1 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -169,9 +169,12 @@ class GeneralGrad { input_target_nodes_inputmeta_map.count(node); // Find and append next nodes - const std::vector>& edges = node->GetEdges(); - for (const auto& edge_list : edges) { - for (const Edge& edge : edge_list) { + const paddle::small_vector, + kSlotSmallVectorSize>& metas = + node->OutputMeta(); + for (const auto& meta_list : metas) { + for (const GradSlotMeta& meta : meta_list) { + const auto& edge = meta.GetEdge(); GradNodeBase* next_node = edge.GetMutableGradNode().get(); // Next node could be nullptr if it is leaf tensor with no @@ -381,13 +384,15 @@ class GeneralGrad { "unable to find copied target for certain grad node.")); GradNodeBase* copied_node = orig_to_copied_node_mapping_[orig_node].get(); - const std::vector>& orig_edges = orig_node->GetEdges(); - std::vector>& copied_edges = - copied_node->GetMutableEdges(); - for (size_t i = 0; i < orig_edges.size(); i++) { - for (size_t j = 0; j < orig_edges[i].size(); j++) { - const Edge& orig_edge = orig_edges[i][j]; - Edge& copied_edge = copied_edges[i][j]; + const paddle::small_vector, + kSlotSmallVectorSize>& orig_meta = + orig_node->OutputMeta(); + paddle::small_vector, kSlotSmallVectorSize>& + copied_edges = copied_node->MutableOutputMeta(); + for (size_t i = 0; i < orig_meta.size(); i++) { + for (size_t j = 0; j < orig_meta[i].size(); j++) { + const Edge& orig_edge = orig_meta[i][j].GetEdge(); + Edge& copied_edge = copied_edges[i][j].GetMutableEdge(); std::shared_ptr orig_next_node = orig_edge.GetMutableGradNode(); @@ -468,9 +473,11 @@ std::unordered_map getInDegreeMap( "We got null node when we traverse the backward graph, and this " "should not happened please check your code and contact us.")); // Find and append next nodes - const std::vector>& edges = node->GetEdges(); - for (const auto& edge_list : edges) { - for (const Edge& edge : edge_list) { + const paddle::small_vector, kSlotSmallVectorSize>& + metas = node->OutputMeta(); + for (const auto& meta_list : metas) { + for (const GradSlotMeta& meta : meta_list) { + const auto& edge = meta.GetEdge(); GradNodeBase* next_node = edge.GetMutableGradNode().get(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached @@ -546,7 +553,13 @@ std::vector RunBackward( for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; - AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(tensor); + AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor); + if (auto_grad_meta == nullptr) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << tensor.name(); + continue; + } // Get grad input info from target tensors auto input_info = auto_grad_meta->OutRankInfo(); @@ -689,8 +702,10 @@ std::vector RunBackward( VLOG(6) << "Run Backward Kernel 
with GradTensorHolder."; // Run Pre Backward Node and get outputs - std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers(), create_graph); + paddle::small_vector, + kSlotSmallVectorSize> + grad_output_tensors = (*node)(node_input_buffer->Buffers(), + create_graph, is_general_grad); // retain_grad or not if (!retain_graph) { @@ -704,17 +719,18 @@ std::vector RunBackward( node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node - const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), + const paddle::small_vector, kSlotSmallVectorSize>& + metas = node->OutputMeta(); + PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " ") or the same as number of output grad tensors, but we " "got edges size is: %d, grad_output size is: %d", - edges.size(), grad_output_tensors.size())); + metas.size(), grad_output_tensors.size())); - for (size_t i = 0; i < edges.size(); i++) { - for (size_t j = 0; j < edges[i].size(); j++) { - const Edge& edge = edges[i][j]; + for (size_t i = 0; i < metas.size(); i++) { + for (size_t j = 0; j < metas[i].size(); j++) { + const Edge& edge = metas[i][j].GetEdge(); if (!edge.IsInitialized()) { continue; } diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 08ca3bed5a653..2bb86a86e8348 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -19,9 +19,12 @@ #include "paddle/phi/core/dense_tensor.h" namespace egr { -std::vector> RunCustomOpNode:: -operator()(std::vector>& grads, - bool create_graph) { // NOLINT +paddle::small_vector, + kSlotSmallVectorSize> +RunCustomOpNode::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, + bool create_graph, bool is_new_grad) { // NOLINT paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); @@ -30,8 +33,9 @@ operator()(std::vector>& grads, auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); - std::vector> tmp_ins( - grad_inputs_name.size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_ins(grad_inputs_name.size()); VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); for (size_t i = 0; i < grads.size(); i++) { @@ -57,17 +61,19 @@ operator()(std::vector>& grads, } VLOG(6) << "Prepare Grad attrs"; ctx.EmplaceBackAttrs(attrs_); - std::vector> outs( - GetEdges().size()); - std::vector> tmp_outs( - grad_outputs_names.size()); + paddle::small_vector, + kSlotSmallVectorSize> + outs(OutputMeta().size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_outs(grad_outputs_names.size()); VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); - for (size_t i = 0; i < GetEdges().size(); i++) { + for (size_t i = 0; i < OutputMeta().size(); i++) { if (map[0].find(i) != map[0].end()) { VLOG(7) << "Insert grad outputs: " << i - << " with size: " << GetEdges()[i].size() + << " with size: " << OutputMeta()[i].size() << " to tmp_outputs: " << map[0][i]; - for (size_t j = 0; j < GetEdges()[i].size(); j++) { + for (size_t j = 0; j < 
OutputMeta()[i].size(); j++) { outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ std::make_shared( phi::DataType::UNDEFINED), diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index 6db410fa0f1af..4801088e51ba5 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -36,10 +36,13 @@ class RunCustomOpNode : public GradNodeBase { } // Functor: perform backward computations - virtual std::vector> - operator()( // NOLINT - std::vector>& grads, // NOLINT - bool create_graph = false) // NOLINT + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()( // NOLINT + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) // NOLINT override; std::string name() { diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 5b4921320f6b0..610b177829e2f 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -40,70 +40,20 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - adj_edges_.resize(bwd_out_slot_num); } -void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { - PADDLE_ENFORCE_LT( - slot_id, adj_edges_.size(), - paddle::platform::errors::InvalidArgument( - "Given slot id is out of range of adj_edges outter size, " - "adj_edges is designed to has the same size of grad " - "inputs's slot num.")); - - for (size_t i = 0; i < metas->size(); i++) { - const auto& meta = (*metas)[i]; - // adj_edges has as same rank as fwd inputs, and record it's output rank - // from - // its pre-ops - if (meta && !meta->StopGradient()) { - auto node = meta->GetMutableGradNode(); - if (!node || !node.get()) { - meta->SetGradNode(std::make_shared(meta)); - } - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " (addr: " << this << ") " - << " to " << meta->GetMutableGradNode()->name() - << " (addr: " << meta->GetMutableGradNode().get() << ")"; - - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { - adj_edges_[slot_id].emplace_back(); - } - } -} - -void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { - PADDLE_ENFORCE_LT( - slot_id, adj_edges_.size(), - paddle::platform::errors::InvalidArgument( - "Given slot id is out of range of adj_edges outter size, " - "adj_edges is designed to has the same size of grad " - "inputs's slot num.")); - - if (meta && !meta->StopGradient()) { - auto node = meta->GetMutableGradNode(); - if (!node || !node.get()) { - meta->SetGradNode(std::make_shared(meta)); - } - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " (addr: " << this << ") " - << " to " << meta->GetMutableGradNode()->name() - << " (addr: " << meta->GetMutableGradNode().get() << ")"; - - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { - adj_edges_[slot_id].emplace_back(); - } +const paddle::small_vector, kSlotSmallVectorSize>& +GradNodeBase::InputMeta() const { + return bwd_in_meta_; } -const std::vector>& GradNodeBase::InputMeta() const { - return bwd_in_meta_; +const paddle::small_vector, kSlotSmallVectorSize>& +GradNodeBase::OutputMeta() const { + return 
bwd_out_meta_; } -const std::vector>& GradNodeBase::OutputMeta() const { +paddle::small_vector, kSlotSmallVectorSize>& +GradNodeBase::MutableOutputMeta() { return bwd_out_meta_; } @@ -123,7 +73,9 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, } auto& meta = metas[0]; - meta.SetStopGradient(fwd_out_meta->StopGradient()); + if (fwd_out_meta && fwd_out_meta->StopGradient()) { + meta.SetStopGradient(fwd_out_meta->StopGradient()); + } if (!fwd_out.initialized()) { VLOG(6) @@ -153,8 +105,8 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, meta.SetTensorMeta(dense_tensor->meta()); meta.SetPlace(fwd_out.place()); - if (paddle::framework::IsComplexType( - paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + if (dense_tensor->type() == paddle::experimental::DataType::COMPLEX64 || + dense_tensor->type() == paddle::experimental::DataType::COMPLEX128) { need_complex_to_real_ = true; } } @@ -186,7 +138,7 @@ void GradNodeBase::SetGradInMeta( "Bwd_in_meta should only be called while " "autograd_meta is not null. If you got this " "error, it indicates bugs in framework.")); - if (fwd_out_meta->StopGradient()) { + if (fwd_out_meta && fwd_out_meta->StopGradient()) { // Set Stop Gradient only when its true or non-initialized autograd_meta, // since all default value is false. meta.SetStopGradient(fwd_out_meta->StopGradient()); @@ -212,8 +164,8 @@ void GradNodeBase::SetGradInMeta( meta.SetTensorMeta(dense_tensor->meta()); meta.SetPlace(fwd_out_tensor.place()); - if (paddle::framework::IsComplexType( - paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + if (dense_tensor->type() == paddle::experimental::DataType::COMPLEX64 || + dense_tensor->type() == paddle::experimental::DataType::COMPLEX128) { need_complex_to_real_ = true; } } else { @@ -238,12 +190,24 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, metas.resize(1); } auto& meta = metas[0]; + // Set Stop_gradient if (fwd_in_meta) { meta.SetStopGradient(fwd_in_meta->StopGradient()); - } else { - meta.SetStopGradient(true); } + // Set Adj Edges + if (fwd_in_meta && !fwd_in_meta->StopGradient()) { + auto node = fwd_in_meta->GetMutableGradNode(); + if (!node || !node.get()) { + fwd_in_meta->SetGradNode( + std::make_shared(fwd_in_meta)); + } + VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + << this->name() << " (addr: " << this << ") " + << " to " << fwd_in_meta->GetMutableGradNode()->name() + << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; + meta.SetEdge(fwd_in_meta->GetMutableGradNode(), fwd_in_meta->OutRankInfo()); + } // Record TensorMeta if (fwd_in.impl() && fwd_in.impl().get()) { if (phi::DenseTensor::classof(fwd_in.impl().get())) { @@ -282,30 +246,43 @@ void GradNodeBase::SetGradOutMeta( const auto& fwd_in_tensor = fwd_in[i]; auto& meta = metas[i]; auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + // Set Stop_gradient if (fwd_in_meta) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. 
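
With AddEdges gone, SetGradOutMeta becomes the single place where a grad node records a forward input's stop-gradient state and wires the adjacent edge, creating an accumulation node on the fly when the input has none yet. A compressed, self-contained sketch of that control flow; all Toy* types below are illustrative stand-ins, not Paddle's real classes.

    #include <iostream>
    #include <memory>
    #include <utility>
    #include <vector>

    struct ToyNode;

    struct ToyEdge {
      std::shared_ptr<ToyNode> node;           // next grad node to visit in backward
      std::pair<size_t, size_t> rank{0, 0};    // (slot, rank) in that node's inputs
    };

    struct ToySlotMeta {
      bool stop_gradient = false;
      ToyEdge edge;                            // the adjacent edge now lives in the slot meta
    };

    struct ToyAutogradMeta {
      bool stop_gradient = false;
      std::shared_ptr<ToyNode> grad_node;      // may be empty for a fresh leaf
      std::pair<size_t, size_t> out_rank{0, 0};
    };

    struct ToyNode {
      std::vector<std::vector<ToySlotMeta>> bwd_out_meta;
      explicit ToyNode(size_t out_slots) : bwd_out_meta(out_slots) {}

      // Mirrors the merged behaviour: one call records stop-gradient AND wires the
      // edge, falling back to an accumulation-style node when the input has none.
      void SetGradOutMeta(ToyAutogradMeta* fwd_in_meta, size_t slot) {
        auto& metas = bwd_out_meta[slot];
        if (metas.empty()) metas.resize(1);
        auto& meta = metas[0];
        if (fwd_in_meta) meta.stop_gradient = fwd_in_meta->stop_gradient;
        if (fwd_in_meta && !fwd_in_meta->stop_gradient) {
          if (!fwd_in_meta->grad_node) {
            fwd_in_meta->grad_node = std::make_shared<ToyNode>(1);  // "accumulation" fallback
          }
          meta.edge = ToyEdge{fwd_in_meta->grad_node, fwd_in_meta->out_rank};
        }
      }
    };

    int main() {
      ToyAutogradMeta leaf;                    // leaf input: requires grad, no node yet
      ToyNode op_node(/*out_slots=*/1);
      op_node.SetGradOutMeta(&leaf, /*slot=*/0);
      std::cout << std::boolalpha
                << (op_node.bwd_out_meta[0][0].edge.node == leaf.grad_node) << "\n";  // true
      return 0;
    }

Keeping the edge inside the slot meta also removes the need to keep a separate adj_edges_ array in sync with bwd_out_meta_, which is what the deleted code above had to do.
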
meta.SetStopGradient(fwd_in_meta->StopGradient()); } + // Set Adj Edges + if (fwd_in_meta && !fwd_in_meta->StopGradient()) { + auto node = fwd_in_meta->GetMutableGradNode(); + if (!node || !node.get()) { + fwd_in_meta->SetGradNode( + std::make_shared(fwd_in_meta)); + } + VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + << this->name() << " (addr: " << this << ") " + << " to " << fwd_in_meta->GetMutableGradNode()->name() + << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; + meta.SetEdge(fwd_in_meta->GetMutableGradNode(), + fwd_in_meta->OutRankInfo()); + } // Record TensorMeta if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { // Only Copy Meta phi::DenseTensor* dense_tensor = static_cast(fwd_in_tensor.impl().get()); - PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, paddle::platform::errors::Fatal( - "Attempting to copy DenseTensorMeta with " - "phi::DataType::UNDEFINED," + "Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); meta.SetPlace(fwd_in_tensor.place()); } } else { - VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " - "with non-DenseTensor argument."; + VLOG(6) + << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } } @@ -328,18 +305,14 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; -} - -std::vector>& GradNodeBase::GetMutableEdges() { - return adj_edges_; -} - -std::vector> +paddle::small_vector, + kSlotSmallVectorSize> GradNodeBase::ApplyGradientHooks( - const std::vector>& tensors) { - std::vector> outs(tensors.size()); + const paddle::small_vector, + kSlotSmallVectorSize>& tensors) { + paddle::small_vector, + kSlotSmallVectorSize> + outs(tensors.size()); for (auto& hook_pair : gradient_hooks_) { size_t slot_id = std::get<0>(hook_pair.second); size_t rank = std::get<1>(hook_pair.second); @@ -386,7 +359,8 @@ GradNodeBase::ApplyGradientHooks( } void GradNodeBase::HandleComplexGradToRealGrad( - std::vector>* out_grads) { + paddle::small_vector, + kSlotSmallVectorSize>* out_grads) { for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { const std::vector& slot_out_grads = (*out_grads)[slot_id]; diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 201aae294f928..6fdee203c196c 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -16,6 +16,7 @@ #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" @@ -46,9 +47,8 @@ namespace egr { * indicate which * input of grad this edge belong). * */ -class Edge; class AutogradMeta; - +class GradNodeBase; /** * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle * has lots of operators @@ -56,6 +56,70 @@ class AutogradMeta; * So, we need a meta info * to record it's needs. * **/ +class Edge { + public: + // Default constructor for Edges in order to construct it for AutogradMeta + Edge() : in_slot_id_(0), in_rank_(0), grad_node_(nullptr) {} + + // In real use cases we should create Edge from grad node and input rank which + // indicate which edge it is. 
+ // Since we have slot design in operators we will have to locate an edge with + // slot + // and rank. + Edge(const std::shared_ptr& grad_node, size_t in_slot_id, + size_t in_rank) + : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} + + Edge(const std::shared_ptr& grad_node, + const std::pair& rank_info) + : in_slot_id_(rank_info.first), + in_rank_(rank_info.second), + grad_node_(grad_node) {} + + GradNodeBase* GetGradNode() const { return grad_node_.get(); } + + std::shared_ptr GetMutableGradNode() const { + return grad_node_; + } + + void SetGradNode(const std::shared_ptr& node) { + VLOG(6) << "Reseting Edge's Grad Node"; + grad_node_ = node; + } + + std::pair GetEdgeRankInfo() const { + return std::make_pair(in_slot_id_, in_rank_); + } + + void SetEdgeRankInfo(size_t slot_id, size_t in_rank) { + in_slot_id_ = slot_id; + in_rank_ = in_rank; + } + + void SetEdgeRankInfo( + const std::pair& edge_rank) { + in_slot_id_ = edge_rank.first; + in_rank_ = edge_rank.second; + } + + // Currently we use grad_node_ to identify if a edge is initialized. + bool IsInitialized() const { + if (!grad_node_) { + return false; + } else { + if (!(grad_node_.get())) { + return false; + } else { + return true; + } + } + } + + private: + size_t in_slot_id_; + size_t in_rank_; + std::shared_ptr grad_node_{nullptr}; +}; class GradSlotMeta { public: GradSlotMeta() = default; @@ -81,10 +145,21 @@ class GradSlotMeta { void SetPlace(const phi::Place& place) { place_ = place; } const phi::Place& GetPlace() const { return place_; } + void SetEdge(const Edge& edge) { adj_edge_ = edge; } + void SetEdge( + const std::shared_ptr& grad_node, + const std::pair& rank_info) { + adj_edge_.SetGradNode(grad_node); + adj_edge_.SetEdgeRankInfo(rank_info); + } + Edge& GetMutableEdge() { return adj_edge_; } + const Edge& GetEdge() const { return adj_edge_; } + private: bool stop_gradient_{false}; phi::Place place_; std::shared_ptr meta_ = nullptr; + Edge adj_edge_; }; class GradNodeBase { @@ -107,9 +182,12 @@ class GradNodeBase { * so, vector of vector * is better choice to fit this format. * **/ - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false) = 0; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) = 0; virtual void ClearTensorWrappers() = 0; @@ -118,17 +196,6 @@ class GradNodeBase { * **/ virtual std::shared_ptr Copy() const = 0; - /** - * AddEdges is designed to set input tensors' backward Node as current - * node's Edges. - * This method should be call in forward code and for double backward depends - * computation. - * - * This one is called slot by slot - * **/ - void AddEdges(std::vector* metas, size_t slot_id); - void AddEdges(AutogradMeta* meta, size_t slot_id); - // adj_edges were moved inside OutputMeta(), so no available direct access // from GradNodeBase. 
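
Because each GradSlotMeta now carries its adjacent Edge, the separate adj_edges_ array and GetEdges()/GetMutableEdges() disappear, and backward traversal walks OutputMeta() and asks each slot meta for its edge instead. A minimal sketch of that traversal pattern over toy types (again illustrative, not the real classes).

    #include <iostream>
    #include <vector>

    struct ToyGradNode;

    struct ToyEdge {
      ToyGradNode* next = nullptr;            // nullptr means "not initialized" (leaf with no node)
      bool IsInitialized() const { return next != nullptr; }
    };

    struct ToyGradSlotMeta {
      ToyEdge edge;
      const ToyEdge& GetEdge() const { return edge; }
    };

    struct ToyGradNode {
      std::vector<std::vector<ToyGradSlotMeta>> output_meta;
      const std::vector<std::vector<ToyGradSlotMeta>>& OutputMeta() const { return output_meta; }
    };

    // Mirrors the updated loop shape in backward.cc: iterate slot metas rather than
    // a separate edge array, and skip uninitialized edges.
    static size_t CountNextNodes(const ToyGradNode& node) {
      size_t count = 0;
      for (const auto& meta_list : node.OutputMeta()) {
        for (const auto& meta : meta_list) {
          const ToyEdge& edge = meta.GetEdge();
          if (!edge.IsInitialized()) continue;
          ++count;  // in the real code: enqueue edge.GetMutableGradNode() for the next step
        }
      }
      return count;
    }

    int main() {
      ToyGradNode leaf_node;
      ToyGradNode op_node;
      op_node.output_meta.resize(1);
      op_node.output_meta[0].resize(2);
      op_node.output_meta[0][0].edge.next = &leaf_node;  // first slot meta points at the leaf node
      std::cout << CountNextNodes(op_node) << "\n";       // prints 1: the empty edge is skipped
      return 0;
    }
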
// To access Edges, get GradSlotMeta by calling OutputMeta(), then use @@ -136,10 +203,15 @@ class GradNodeBase { /** * Get Input Meta of current Grad node**/ - const std::vector>& InputMeta() const; + const paddle::small_vector, kSlotSmallVectorSize>& + InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector>& OutputMeta() const; + const paddle::small_vector, kSlotSmallVectorSize>& + OutputMeta() const; + + paddle::small_vector, kSlotSmallVectorSize>& + MutableOutputMeta(); /** * Set bwd ins and outs info with forward vars * **/ @@ -180,23 +252,22 @@ class GradNodeBase { * **/ inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); } - std::vector> ApplyGradientHooks( - const std::vector>& tensors); + paddle::small_vector, + kSlotSmallVectorSize> + ApplyGradientHooks( + const paddle::small_vector, + kSlotSmallVectorSize>& tensors); /** * Handle Complex - Real Type Promotion * **/ void HandleComplexGradToRealGrad( - std::vector>* out_grads); + paddle::small_vector, + kSlotSmallVectorSize>* out_grads); bool NeedComplexToRealConversion() { return need_complex_to_real_; } virtual std::string name() { return "GradNodeBase"; } - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; - std::vector>& GetMutableEdges(); - /** * The following interfaces are designed for no_need_buffer * **/ @@ -207,18 +278,13 @@ class GradNodeBase { } private: - // TODO(zhanlve): Merge adj_edges_ into GradOutMeta - // Edges recorded the backward related node info, which indicate all edges - // linked - // by this Grad Node. - // Why we need vector>: Edges is as same rank as bwd output. - std::vector> adj_edges_; - // bwd_out_meta_ is used to record Grad output info for backward - std::vector> bwd_out_meta_; + paddle::small_vector, kSlotSmallVectorSize> + bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector> bwd_in_meta_; + paddle::small_vector, kSlotSmallVectorSize> + bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -235,71 +301,6 @@ class GradNodeBase { bool is_tensor_wrappers_cleared_ = false; }; -class Edge { - public: - // Default constructor for Edges in order to construct it for AutogradMeta - Edge() : in_slot_id_(0), in_rank_(0), grad_node_(nullptr) {} - - // In real use cases we should create Edge from grad node and input rank which - // indicate which edge it is. - // Since we have slot design in operators we will have to locate an edge with - // slot - // and rank. 
- Edge(const std::shared_ptr& grad_node, size_t in_slot_id, - size_t in_rank) - : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} - - Edge(const std::shared_ptr& grad_node, - const std::pair& rank_info) - : in_slot_id_(rank_info.first), - in_rank_(rank_info.second), - grad_node_(grad_node) {} - - GradNodeBase* GetGradNode() const { return grad_node_.get(); } - - std::shared_ptr GetMutableGradNode() const { - return grad_node_; - } - - void SetGradNode(const std::shared_ptr& node) { - VLOG(6) << "Reseting Edge's Grad Node"; - grad_node_ = node; - } - - std::pair GetEdgeRankInfo() const { - return std::make_pair(in_slot_id_, in_rank_); - } - - void SetEdgeRankInfo(size_t slot_id, size_t in_rank) { - in_slot_id_ = slot_id; - in_rank_ = in_rank; - } - - void SetEdgeRankInfo( - const std::pair& edge_rank) { - in_slot_id_ = edge_rank.first; - in_rank_ = edge_rank.second; - } - - // Currently we use grad_node_ to identify if a edge is initialized. - bool IsInitialized() const { - if (!grad_node_) { - return false; - } else { - if (!(grad_node_.get())) { - return false; - } else { - return true; - } - } - } - - private: - size_t in_slot_id_; - size_t in_rank_; - std::shared_ptr grad_node_{nullptr}; -}; - inline void CheckTensor(const paddle::experimental::Tensor& pre, const paddle::experimental::Tensor& post) { if (!pre.initialized() && post.initialized()) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index 80b7c59df8fa0..a9800afc626c9 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -27,7 +27,8 @@ namespace egr { class GradTensorHolder { public: explicit GradTensorHolder( - const std::vector>& metas) { + const paddle::small_vector, + kSlotSmallVectorSize>& metas) { VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { @@ -39,7 +40,8 @@ class GradTensorHolder { GradTensorHolder(const GradTensorHolder& other) = default; explicit GradTensorHolder( - std::vector>&& inputs) + paddle::small_vector, + kSlotSmallVectorSize>&& inputs) : buffer_(std::move(inputs)) {} GradTensorHolder& operator=(const GradTensorHolder& other) = default; @@ -56,14 +58,18 @@ class GradTensorHolder { return buffer_[pos]; } - std::vector>& Buffers() { + paddle::small_vector, + kSlotSmallVectorSize>& + Buffers() { return buffer_; } void SetBufferSlotRankZeros(size_t slot_id, size_t rank); private: - std::vector> buffer_; + paddle::small_vector, + kSlotSmallVectorSize> + buffer_; }; } // namespace egr diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index 42036a28cfa15..fad4fd50a5e3e 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -29,14 +29,18 @@ #include "pybind11/pytypes.h" namespace egr { -std::vector> GradNodePyLayer:: -operator()( - std::vector>& grads, // NOLINT - bool create_graph) { +paddle::small_vector, + kSlotSmallVectorSize> +GradNodePyLayer::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph, + bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: " << name(); - std::vector> hooked_grads = - GradNodePyLayer::ApplyGradientHooks(grads); + paddle::small_vector, + kSlotSmallVectorSize> + hooked_grads = GradNodePyLayer::ApplyGradientHooks(grads); paddle::pybind::PyLayerObject* ctx = reinterpret_cast(ctx_); @@ -124,7 +128,9 @@ operator()( 
ctx->forward_input_tensor_is_duplicable.size(), outputs_size)); } - std::vector> grad_out; + paddle::small_vector, + kSlotSmallVectorSize> + grad_out; grad_out.reserve(ctx->forward_input_tensor_is_duplicable.size()); for (size_t i = 0; i < ctx->forward_input_tensor_is_duplicable.size(); i++) { if (i < outputs_size) { diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index 87e8acf88a694..b477d7a9ad996 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -32,11 +32,17 @@ class GradNodePyLayer : public GradNodeBase { ctx_ = ctx; } - ~GradNodePyLayer() override { Py_DECREF(ctx_); }; + ~GradNodePyLayer() override { + Py_DECREF(ctx_); + Py_XDECREF(outputs_); + }; - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false) override; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 3ee1603a53ab4..f13fcfa990057 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -88,7 +88,7 @@ class TensorWrapper { } else { intermidiate_tensor_.set_impl(tensor.impl()); } - + // TODO(jiabin): This may has server performance issue intermidiate_tensor_.set_name(tensor.name() + "@Saved"); auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 6c6c7fd25e5e5..f9f00749dc87b 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -80,14 +80,18 @@ TEST(AccumulationNode, Tensor) { grad_meta->SetStopGradient(false); // operator() - std::vector> et0_vec = {{et0}}; + paddle::small_vector, + kSlotSmallVectorSize> + et0_vec = {{et0}}; paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - std::vector> et1_vec = {{et1}}; + paddle::small_vector, + kSlotSmallVectorSize> + et1_vec = {{et1}}; paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0]; auto* ret_et1_ptr = diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index d592b5ccf66ff..6687b6621ad54 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -34,7 +34,9 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { auto grad_test_node0 = std::make_shared( /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); auto grad_test_node1 = std::make_shared(); - std::vector> grads; + paddle::small_vector, + egr::kSlotSmallVectorSize> + grads; phi::DenseTensorMeta meta = phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( @@ -51,28 +53,9 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { CHECK_EQ(std::dynamic_pointer_cast(res[0][0].impl()) ->data()[0], 6.0f); - VLOG(6) << "Test Add 
Edges"; - egr::Edge tmp_edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(tmp_edge0); - auto_grad0->SetStopGradient(false); - egr::Edge tmp_edge1(grad_test_node1, 3, 4); auto auto_grad1 = std::make_shared(tmp_edge1); et1.set_autograd_meta(auto_grad1); - auto_grad1->SetStopGradient(false); - grad_test_node0->AddEdges(auto_grad0.get(), 0); - - CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, - size_t(1)); - CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, - size_t(2)); - std::vector metas = {auto_grad1.get()}; - - grad_test_node0->AddEdges(&metas, 1); - CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, - size_t(3)); - CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().second, - size_t(4)); VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 8500ec79ef9ba..a00e629d1029a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -31,9 +31,12 @@ class GradTestNode : public egr::GradNodeBase { : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } - std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false) override { + paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()(paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -46,7 +49,9 @@ class GradTestNode : public egr::GradNodeBase { auto* dt_ptr = dt->mutable_data(paddle::platform::CPUPlace()); dt_ptr[0] = 6.0f; paddle::experimental::Tensor et1(dt); - std::vector> res = {{et1}}; + paddle::small_vector, + egr::kSlotSmallVectorSize> + res = {{et1}}; return res; } void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 7d2aafc63628e..0fe349294b438 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -45,7 +45,9 @@ TEST(GradTensorHolder, Constructor) { meta); paddle::experimental::Tensor et = paddle::experimental::Tensor(dt); - std::vector> inputs; + paddle::small_vector, + kSlotSmallVectorSize> + inputs; inputs.push_back({et}); GradTensorHolder grad_tensor_holder4 = GradTensorHolder(std::move(inputs)); diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 8c127efa4f7f3..7552ad83fa20f 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -76,8 +76,7 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta({leaf_tensor}, 0); } std::vector outs = {target_tensor}; // Run Backward @@ -135,8 +134,7 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(acc_node_ptr)); 
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta({leaf_tensor}, 0); } // Run Backward @@ -191,12 +189,12 @@ TEST(Backward, LinearNodes) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); // Connect Node0 -> Node1 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node1_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node1_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor, 0); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -208,8 +206,7 @@ TEST(Backward, LinearNodes) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res1 = {auto_grad_meta1}; - node1_ptr->AddEdges(&res1, 0); + node1_ptr->SetGradOutMeta(leaf_tensor, 0); } // Use Empty Grad Tensor @@ -288,20 +285,20 @@ TEST(Backward, WithAccumulation) { auto_grad_meta1->SetStopGradient(false); // Connect Node0 -> Node2 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node2_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor0); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node2_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor0, 0); // Connect Node1 -> Node2 via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(node2_ptr); - std::vector res1 = {&meta1}; - node1_ptr->AddEdges(&res1, 0); + auto tmp_tensor1 = paddle::experimental::Tensor(); + auto* meta1 = EagerUtils::autograd_meta(&tmp_tensor1); + meta1->SetStopGradient(false); + meta1->SetSingleOutRankWithSlot(0, 0); + meta1->SetGradNode(node2_ptr); + node1_ptr->SetGradOutMeta(tmp_tensor1, 0); AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -314,7 +311,7 @@ TEST(Backward, WithAccumulation) { auto_grad_meta2->SetStopGradient(false); std::vector res2 = {auto_grad_meta2}; - node2_ptr->AddEdges(&res2, 0); + node2_ptr->SetGradOutMeta(leaf_tensor, 0); } Backward(target_tensors, grad_tensors); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 8b0759c17ed37..4337c0d092ca0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -69,7 +69,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { meta->SetSingleOutRankWithSlot(0, 0); meta->SetGradNode(acc_node_ptr); std::vector res = {meta}; - scale_node_ptr->AddEdges(&res, 0); + scale_node_ptr->SetGradOutMeta(leaf_tensor, 0); Backward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 0bd1f3bdb36aa..bcb9820419d0f 100644 
--- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -251,10 +251,11 @@ TEST(EagerUtils, GetGradAccumulationNode) { } TEST(EagerUtils, FillZeroForEmptyGradInputs) { - std::vector> grads = { - std::vector(1)}; - std::vector> slot_metas = { - std::vector(1)}; + paddle::small_vector, + egr::kSlotSmallVectorSize> + grads = {std::vector(1)}; + paddle::small_vector, egr::kSlotSmallVectorSize> + slot_metas = {std::vector(1)}; phi::DenseTensorMeta tensor_meta; tensor_meta.dtype = paddle::experimental::DataType::FLOAT32; diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index dc44d95daac1d..4cb316380aade 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -137,12 +137,16 @@ TEST(Forward, LinearNodes) { // 2. TensorWrapper: No TensorWrapper for ScaleNode // 3. NextEdges: Node 1 -> Node 0 - const std::vector>& node1_edges = grad_node1->GetEdges(); - const auto& node1_edge = node1_edges[0]; - - CHECK_EQ(static_cast(node1_edge[0].GetEdgeRankInfo().first), 0); - CHECK_EQ(static_cast(node1_edge[0].GetEdgeRankInfo().second), 0); - CHECK_EQ(node1_edge[0].GetGradNode(), grad_node0); + const paddle::small_vector, + egr::kSlotSmallVectorSize>& node1_metas = + grad_node1->OutputMeta(); + const auto& node1_meta = node1_metas[0]; + + CHECK_EQ(static_cast(node1_meta[0].GetEdge().GetEdgeRankInfo().first), + 0); + CHECK_EQ(static_cast(node1_meta[0].GetEdge().GetEdgeRankInfo().second), + 0); + CHECK_EQ(node1_meta[0].GetEdge().GetGradNode(), grad_node0); } } @@ -232,16 +236,19 @@ TEST(Forward, BranchedNodes) { // 2. TensorWrapper: No TensorWrapper for ScaleNode // 3. 
NextEdges // Node 1 -> Node 0 - const std::vector>& node1_edges = grad_node1->GetEdges(); - const Edge& node1_edge = node1_edges[0][0]; + const paddle::small_vector, kSlotSmallVectorSize>& + node1_metas = grad_node1->OutputMeta(); + const Edge& node1_edge = node1_metas[0][0].GetEdge(); CHECK_EQ(static_cast(node1_edge.GetEdgeRankInfo().first), 0); CHECK_EQ(static_cast(node1_edge.GetEdgeRankInfo().second), 0); CHECK_EQ(node1_edge.GetGradNode(), grad_node0); // Node 2 -> Node 0 - const std::vector>& node2_edges = grad_node2->GetEdges(); - const Edge& node2_edge = node2_edges[0][0]; + const paddle::small_vector, + egr::kSlotSmallVectorSize>& node2_metas = + grad_node2->OutputMeta(); + const Edge& node2_edge = node2_metas[0][0].GetEdge(); CHECK_EQ(static_cast(node2_edge.GetEdgeRankInfo().first), 0); CHECK_EQ(static_cast(node2_edge.GetEdgeRankInfo().second), 0); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc index 7e64c65d8205e..72a94b40ed753 100644 --- a/paddle/fluid/eager/tests/task_tests/grad_test.cc +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -87,7 +87,7 @@ TEST(Grad, SingleNodeEmptyGrad) { // grad_node Add Edges std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta(leaf_tensor, 0); } std::vector outs = {output_tensor}; @@ -150,7 +150,7 @@ TEST(Grad, SingleNodeCustomGrad) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta(leaf_tensor, 0); } auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); @@ -207,12 +207,12 @@ TEST(Grad, LinearNodes) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); // Connect Node0 -> Node1 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node1_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node1_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor, 0); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -224,8 +224,7 @@ TEST(Grad, LinearNodes) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res1 = {auto_grad_meta1}; - node1_ptr->AddEdges(&res1, 0); + node1_ptr->SetGradOutMeta(leaf_tensor, 0); } // Use Empty Grad Tensor @@ -304,20 +303,20 @@ TEST(Grad, WithAccumulation) { auto_grad_meta1->SetStopGradient(false); // Connect Node0 -> Node2 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node2_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor0); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node2_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor0, 0); // Connect Node1 -> Node2 via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(node2_ptr); - std::vector res1 = {&meta1}; - 
node1_ptr->AddEdges(&res1, 0); + auto tmp_tensor1 = paddle::experimental::Tensor(); + auto meta1 = EagerUtils::autograd_meta(&tmp_tensor1); + meta1->SetStopGradient(false); + meta1->SetSingleOutRankWithSlot(0, 0); + meta1->SetGradNode(node2_ptr); + node1_ptr->SetGradOutMeta(tmp_tensor1, 0); AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -329,8 +328,7 @@ TEST(Grad, WithAccumulation) { auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); auto_grad_meta2->SetStopGradient(false); - std::vector res2 = {auto_grad_meta2}; - node2_ptr->AddEdges(&res2, 0); + node2_ptr->SetGradOutMeta(leaf_tensor, 0); } auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 2c53fc89f650e..855fe526c10c8 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -110,21 +110,20 @@ TEST(RetainGrad, HookBeforeRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto auto_grad_meta = EagerUtils::autograd_meta(&tmp_tensor0); - auto auto_grad_meta = std::make_shared(); - - auto acc_node_ptr = - std::make_shared(auto_grad_meta.get()); + auto acc_node_ptr = std::make_shared(auto_grad_meta); auto_grad_meta->SetStopGradient(false); auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - std::vector res = {auto_grad_meta.get()}; - scale_node_ptr->AddEdges(&res, 0); + std::vector res = {auto_grad_meta}; + scale_node_ptr->SetGradOutMeta(tmp_tensor0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( - auto_grad_meta)); + tmp_tensor0.mutable_autograd_meta())); egr_utils_api::RegisterGradientHookForTensor( leaf_tensor, std::make_shared(hook_function)); @@ -181,19 +180,17 @@ TEST(RetainGrad, HookAfterRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - - auto auto_grad_meta = std::make_shared(); - auto acc_node_ptr = - std::make_shared(auto_grad_meta.get()); + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto auto_grad_meta = EagerUtils::autograd_meta(&tmp_tensor0); + auto acc_node_ptr = std::make_shared(auto_grad_meta); auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetStopGradient(false); - std::vector res = {auto_grad_meta.get()}; - scale_node_ptr->AddEdges(&res, 0); + scale_node_ptr->SetGradOutMeta(tmp_tensor0, 0); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( - auto_grad_meta)); + tmp_tensor0.mutable_autograd_meta())); egr_utils_api::RegisterGradientHookForTensor( leaf_tensor, std::make_shared(hook_function)); diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 416739bbbb177..6b0a84835045c 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -69,9 +69,6 @@ inline void run_program_dygraph_function( grad_node->SetGradOutMeta(params, /*slot id*/ 1); grad_node->SetGradInMeta(deref_out, 0); - // Set Next Edges - grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); - grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); diff --git 
a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 9347a76fd48f0..fe1cdefb7d572 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -364,12 +364,16 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations - virtual std::vector> operator()( - std::vector> &grads, // NOLINT - bool create_graph) override { + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()(paddle::small_vector, + egr::kSlotSmallVectorSize> &grads, // NOLINT + bool create_graph, + bool is_new_grad) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; - std::vector> hooked_grads = - GradNodeRunProgram::ApplyGradientHooks(grads); + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads = GradNodeRunProgram::ApplyGradientHooks(grads); PADDLE_ENFORCE_EQ(hooked_grads.size(), 1, paddle::platform::errors::InvalidArgument( "The hooked_grads.size() of RunProgramGradOp should " diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 66d877f06e21d..033af5c496c98 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -441,8 +441,10 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } void EagerUtils::FillZeroForEmptyGradInputs( - std::vector>* in_grads, - const std::vector>& grad_in_metas) { + paddle::small_vector, + kSlotSmallVectorSize>* in_grads, + const paddle::small_vector, kSlotSmallVectorSize>& + grad_in_metas) { for (size_t i = 0; i < in_grads->size(); i++) { for (size_t j = 0; j < (*in_grads)[i].size(); j++) { paddle::experimental::Tensor& grad = (*in_grads)[i][j]; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 51a322c8524ac..ef2b1baac661b 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -234,8 +234,10 @@ class EagerUtils { * Fill Zero * **/ static void FillZeroForEmptyGradInputs( - std::vector>* out_grads, - const std::vector>& grad_out_metas); + paddle::small_vector, + kSlotSmallVectorSize>* out_grads, + const paddle::small_vector, + kSlotSmallVectorSize>& grad_out_metas); }; } // namespace egr diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index cf7a7c3c9f43d..2599e3232cac7 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -18,35 +18,37 @@ namespace paddle { namespace framework { paddle::any GetAttrValue(const Attribute& attr) { - if (attr.type() == typeid(int)) { - return paddle::any(BOOST_GET_CONST(int, attr)); - } else if (attr.type() == typeid(float)) { - return paddle::any(BOOST_GET_CONST(float, attr)); - } else if (attr.type() == typeid(std::string)) { - return paddle::any(BOOST_GET_CONST(std::string, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(bool)) { - return paddle::any(BOOST_GET_CONST(bool, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(BlockDesc*)) { - return paddle::any(BOOST_GET_CONST(BlockDesc*, attr)); - } else if (attr.type() == 
typeid(int64_t)) { - return paddle::any(BOOST_GET_CONST(int64_t, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported Attribute value type.")); + switch (AttrTypeID(attr)) { + case proto::AttrType::INT: + return BOOST_GET_CONST(int, attr); + case proto::AttrType::FLOAT: + return BOOST_GET_CONST(float, attr); + case proto::AttrType::STRING: + return BOOST_GET_CONST(std::string, attr); + case proto::AttrType::INTS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::FLOATS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::STRINGS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::BOOLEAN: + return BOOST_GET_CONST(bool, attr); + case proto::AttrType::BOOLEANS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::LONG: + return BOOST_GET_CONST(int64_t, attr); + case proto::AttrType::LONGS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::FLOAT64S: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::BLOCK: + return BOOST_GET_CONST(BlockDesc*, attr); + case proto::AttrType::BLOCKS: + return BOOST_GET_CONST(std::vector, attr); + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); } } diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 7026cc7cf1aa3..2164a21f3f892 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -203,12 +203,17 @@ struct ExtractAttribute> { const std::string& attr_name_; }; + template inline proto::AttrType AttrTypeID() { Attribute tmp = T(); return static_cast(tmp.which() - 1); } +inline proto::AttrType AttrTypeID(const Attribute& attr) { + return static_cast(attr.which() - 1); +} + class AttrReader { public: explicit AttrReader(const AttributeMap& attrs) @@ -237,7 +242,7 @@ class AttrReader { return *attr_value; } - inline const Attribute& GetAttr(const std::string& name) const { + const Attribute* GetAttr(const std::string& name) const { auto it = attrs_.find(name); bool found = it != attrs_.end(); if (!found) { @@ -246,11 +251,10 @@ class AttrReader { found = it != default_attrs_->end(); } } - PADDLE_ENFORCE_EQ(found, true, - platform::errors::NotFound( - "Attribute (%s) should be in AttributeMap.", name)); - - return it->second; + if (found) { + return &it->second; + } + return nullptr; } private: diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 63e289af45209..99e786d3b0201 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -125,7 +125,6 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, #ifdef PADDLE_WITH_MKLDNN tran_lod_tensor->set_mem_desc(in_lod_tensor.mem_desc()); #endif - tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { auto &in_selected_rows = in_var.Get(); diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 75ab747794f01..fda588db4d82a 100644 --- a/paddle/fluid/framework/data_type.cc +++ 
b/paddle/fluid/framework/data_type.cc @@ -109,8 +109,8 @@ size_t SizeOfType(proto::VarType::Type type) { } // Now only supports promotion of complex type -bool NeedPromoteTypes(const proto::VarType::Type a, - const proto::VarType::Type b) { +inline bool NeedPromoteTypes(const proto::VarType::Type& a, + const proto::VarType::Type& b) { return (IsComplexType(a) || IsComplexType(b)); } diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 124f2a86e9423..81a7f6a41bf3a 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -200,7 +200,7 @@ inline std::ostream& operator<<(std::ostream& out, return out; } -extern inline bool IsComplexType(const proto::VarType::Type type) { +extern inline bool IsComplexType(const proto::VarType::Type& type) { return (type == proto::VarType::COMPLEX64 || type == proto::VarType::COMPLEX128); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index e1a1c1fab5ef0..895e459a37dd7 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -522,7 +522,8 @@ class HeterCpuWorker : public HogwildWorker { }; #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ + defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUWorker : public HogwildWorker { public: @@ -537,8 +538,10 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } +#endif void ResetStat(); protected: @@ -588,8 +591,10 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuEvent_t event_; gpuStream_t copy_stream_; +#endif int batch_cnt_{0}; std::atomic done_cnt_{0}; diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index 9c418b2f786ca..e6635a2f941cd 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -75,7 +75,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker); REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ + defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 6dc53c9649e9d..05215a9e5f14b 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -156,6 +156,9 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + // NOTE(wangxi): need clear the vector, otherwise lod_tensor_arr.size() is + // wrong, if size() decrease in next step, an error maybe occur. 
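+ // (That is: the loop above moved every tensor's memory holder into
+ // garbages, leaving empty shells behind, so the array is cleared to keep
+ // its size() consistent with the tensors that still own memory.)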
+ lod_tensor_arr->clear(); } else if (var->IsType()) { } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/fleet/heter_ps/.CMakeLists.txt.swp b/paddle/fluid/framework/fleet/heter_ps/.CMakeLists.txt.swp deleted file mode 100644 index 7d3f69e7424d3..0000000000000 Binary files a/paddle/fluid/framework/fleet/heter_ps/.CMakeLists.txt.swp and /dev/null differ diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index c4b4064e0299e..e7601edb0ca07 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { struct GpuPsGraphNode { int64_t node_id; - int neighbor_size, neighbor_offset; + unsigned int neighbor_size, neighbor_offset; // this node's neighbor is stored on [neighbor_offset,neighbor_offset + // neighbor_size) of int64_t *neighbor_list; }; @@ -32,28 +32,38 @@ struct GpuPsGraphNode { struct GpuPsCommGraph { int64_t *neighbor_list; GpuPsGraphNode *node_list; - int neighbor_size, node_size; + unsigned int neighbor_size, node_size; // the size of neighbor array and graph_node_list array GpuPsCommGraph() : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) + unsigned int neighbor_size_, unsigned int node_size_) : neighbor_list(neighbor_list_), node_list(node_list_), neighbor_size(neighbor_size_), node_size(node_size_) {} + void init_on_cpu(unsigned int neighbor_size, unsigned int node_size) { + this->neighbor_size = neighbor_size; + this->node_size = node_size; + this->neighbor_list = new int64_t[neighbor_size]; + this->node_list = new paddle::framework::GpuPsGraphNode[node_size]; + } + void release_on_cpu() { + delete[] neighbor_list; + delete[] node_list; + } void display_on_cpu() { VLOG(0) << "neighbor_size = " << neighbor_size; VLOG(0) << "node_size = " << node_size; - for (int i = 0; i < neighbor_size; i++) { + for (size_t i = 0; i < neighbor_size; i++) { VLOG(0) << "neighbor " << i << " " << neighbor_list[i]; } - for (int i = 0; i < node_size; i++) { + for (size_t i = 0; i < node_size; i++) { VLOG(0) << "node i " << node_list[i].node_id << " neighbor_size = " << node_list[i].neighbor_size; std::string str; int offset = node_list[i].neighbor_offset; - for (int j = 0; j < node_list[i].neighbor_size; j++) { + for (size_t j = 0; j < node_list[i].neighbor_size; j++) { if (j > 0) str += ","; str += std::to_string(neighbor_list[j + offset]); } @@ -64,11 +74,9 @@ struct GpuPsCommGraph { /* suppose we have a graph like this - 0----3-----5----7 \ |\ |\ 17 8 9 1 2 - we save the nodes in arbitrary order, in this example,the order is [0,5,1,2,7,3,8,9,17] @@ -83,7 +91,6 @@ we record each node's neighbors: 8:3 9:3 17:0 - by concatenating each node's neighbor_list in the order we save the node id. 
we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] this is the neighbor_list of GpuPsCommGraph @@ -114,14 +121,46 @@ node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 */ +struct NeighborSampleQuery { + int gpu_id; + int64_t *key; + int sample_size; + int len; + void initialize(int gpu_id, int64_t key, int sample_size, int len) { + this->gpu_id = gpu_id; + this->key = (int64_t *)key; + this->sample_size = sample_size; + this->len = len; + } + void display() { + int64_t *sample_keys = new int64_t[len]; + VLOG(0) << "device_id " << gpu_id << " sample_size = " << sample_size; + VLOG(0) << "there are " << len << " keys "; + std::string key_str; + cudaMemcpy(sample_keys, key, len * sizeof(int64_t), cudaMemcpyDeviceToHost); + + for (int i = 0; i < len; i++) { + if (key_str.size() > 0) key_str += ";"; + key_str += std::to_string(sample_keys[i]); + } + VLOG(0) << key_str; + delete[] sample_keys; + } +}; struct NeighborSampleResult { int64_t *val; + int64_t *actual_val; int *actual_sample_size, sample_size, key_size; + int total_sample_size; std::shared_ptr val_mem, actual_sample_size_mem; + std::shared_ptr actual_val_mem; int64_t *get_val() { return val; } + int64_t get_actual_val() { return (int64_t)actual_val; } int *get_actual_sample_size() { return actual_sample_size; } int get_sample_size() { return sample_size; } int get_key_size() { return key_size; } + void set_total_sample_size(int s) { total_sample_size = s; } + int get_len() { return total_sample_size; } void initialize(int _sample_size, int _key_size, int dev_id) { sample_size = _sample_size; key_size = _key_size; @@ -134,6 +173,41 @@ struct NeighborSampleResult { memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } + void display() { + VLOG(0) << "in node sample result display ------------------"; + int64_t *res = new int64_t[sample_size * key_size]; + cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int *ac_size = new int[key_size]; + cudaMemcpy(ac_size, actual_sample_size, key_size * sizeof(int), + cudaMemcpyDeviceToHost); // 3, 1, 3 + int total_sample_size = 0; + for (int i = 0; i < key_size; i++) { + total_sample_size += ac_size[i]; + } + int64_t *res2 = new int64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); // r + + int start = 0; + for (int i = 0; i < key_size; i++) { + VLOG(0) << "actual sample size for " << i << "th key is " << ac_size[i]; + VLOG(0) << "sampled neighbors are "; + std::string neighbor, neighbor2; + for (int j = 0; j < ac_size[i]; j++) { + // if (neighbor.size() > 0) neighbor += ";"; + if (neighbor2.size() > 0) neighbor2 += ";"; // r + // neighbor += std::to_string(res[i * sample_size + j]); + neighbor2 += std::to_string(res2[start + j]); // r + } + VLOG(0) << neighbor << " " << neighbor2; + start += ac_size[i]; // r + } + delete[] res; + delete[] res2; // r + delete[] ac_size; + VLOG(0) << " ------------------"; + } NeighborSampleResult(){}; ~NeighborSampleResult() { // if (val != NULL) cudaFree(val); @@ -145,13 +219,39 @@ struct NeighborSampleResult { struct NodeQueryResult { int64_t *val; int actual_sample_size; + int64_t get_val() { return (int64_t)val; } + int get_len() { return actual_sample_size; } + std::shared_ptr val_mem; + void initialize(int query_size, int dev_id) { + platform::CUDADeviceGuard 
guard(dev_id); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + val_mem = memory::AllocShared(place, query_size * sizeof(int64_t)); + val = (int64_t *)val_mem->ptr(); + + // cudaMalloc((void **)&val, query_size * sizeof(int64_t)); + actual_sample_size = 0; + } + void display() { + VLOG(0) << "in node query result display ------------------"; + int64_t *res = new int64_t[actual_sample_size]; + cudaMemcpy(res, val, actual_sample_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); + + VLOG(0) << "actual_sample_size =" << actual_sample_size; + std::string str; + for (int i = 0; i < actual_sample_size; i++) { + if (str.size() > 0) str += ";"; + str += std::to_string(res[i]); + } + VLOG(0) << str; + delete[] res; + VLOG(0) << " ------------------"; + } NodeQueryResult() { val = NULL; actual_sample_size = 0; }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } + ~NodeQueryResult() {} }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index ff36b38b5089f..8a0088114e2ec 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -23,13 +23,18 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); + for (int i = 0; i < gpu_num; i++) { + gpu_graph_list.push_back(GpuPsCommGraph()); + sample_status.push_back(NULL); + tables_.push_back(NULL); + } cpu_table_status = -1; if (topo_aware) { int total_gpu = resource_->total_device(); @@ -82,14 +87,18 @@ class GpuPsGraphTable : public HeterComm { // end_graph_sampling(); // } } + void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id); + void clear_graph_info(int gpu_id); void build_graph_from_cpu(std::vector &cpu_node_list); - NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); - NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key, - int sample_size, int len); - NeighborSampleResult *graph_neighbor_sample_v2(int gpu_id, int64_t *key, - int sample_size, int len, - bool cpu_query_switch); - NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); + NodeQueryResult graph_node_sample(int gpu_id, int sample_size); + NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, + bool cpu_switch); + NeighborSampleResult graph_neighbor_sample(int gpu_id, int64_t *key, + int sample_size, int len); + NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int64_t *key, + int sample_size, int len, + bool cpu_query_switch); + NodeQueryResult query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, int *h_left, diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index b119724e695da..605019cb607fc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -13,7 +13,9 @@ // limitations under the License. 
#include - +#include +#include +#include #pragma once #ifdef PADDLE_WITH_HETERPS //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" @@ -28,13 +30,13 @@ actual_size[0,len) is to save the sample size of each node. for ith node in index, actual_size[i] = min(node i's neighbor size, sample size) sample_result is to save the neighbor sampling result, its size is len * sample_size; - */ -__global__ void get_cpu_id_index(int64_t* key, int* val, int64_t* cpu_key, - int* sum, int* index, int len) { +__global__ void get_cpu_id_index(int64_t* key, unsigned int* val, + int64_t* cpu_key, int* sum, int* index, + int len) { CUDA_KERNEL_LOOP(i, len) { - if (val[i] == -1) { + if (val[i] == ((unsigned int)-1)) { int old = atomicAdd(sum, 1); cpu_key[old] = key[i]; index[old] = i; @@ -44,9 +46,9 @@ __global__ void get_cpu_id_index(int64_t* key, int* val, int64_t* cpu_key, template __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, - int* node_index, int* actual_size, - int64_t* res, int sample_len, - int n) { + unsigned int* node_index, + int* actual_size, int64_t* res, + int sample_len, int n) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -56,7 +58,7 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); while (i < last_idx) { - if (node_index[i] == -1) { + if (node_index[i] == (unsigned int)(-1)) { actual_size[i] = 0; i += BLOCK_WARPS; continue; @@ -93,13 +95,14 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } } -__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* node_index, +__global__ void neighbor_sample_example(GpuPsCommGraph graph, + unsigned int* node_index, int* actual_size, int64_t* res, int sample_len, int* sample_status, int n, int from) { int id = blockIdx.x * blockDim.y + threadIdx.y; if (id < n) { - if (node_index[id] == -1) { + if (node_index[id] == (unsigned int)(-1)) { actual_size[id] = 0; return; } @@ -198,7 +201,6 @@ int GpuPsGraphTable::init_cpu_table( // } /* comment 1 - gpu i triggers a neighbor_sample task, when this task is done, this function is called to move the sample result on other gpu back @@ -211,13 +213,11 @@ int GpuPsGraphTable::init_cpu_table( smaller than sample_size, is saved on src_sample_res [x*sample_size, x*sample_size + actual_sample_size[x]) - since before each gpu runs the neighbor_sample task,the key array is shuffled, but we have the idx array to save the original order. when the gpu i gets all the sample results from other gpus, it relies on idx array to recover the original order. that's what fill_dvals does. 
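To make that concrete, here is a serial sketch of the recovery step
(illustrative only; the real work is done by the fill_dvalues kernel launched
further below, which parallelizes over i, and the names mirror its arguments):
  for (int i = 0; i < len; i++) {
    // i walks the shuffled order; idx[i] is the key's original position
    actual_sample_size[idx[i]] = d_shard_actual_sample_size[i];
    for (int j = 0; j < sample_size; j++) {
      val[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j];
    }
  }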
- */ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( @@ -378,6 +378,18 @@ __global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, } } +__global__ void fill_actual_vals(int64_t* vals, int64_t* actual_vals, + int* actual_sample_size, + int* cumsum_actual_sample_size, + int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + for (int j = 0; j < actual_sample_size[i]; j++) { + actual_vals[cumsum_actual_sample_size[i] + j] = vals[sample_size * i + j]; + } + } +} + __global__ void node_query_example(GpuPsCommGraph graph, int start, int size, int64_t* res) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; @@ -386,6 +398,18 @@ __global__ void node_query_example(GpuPsCommGraph graph, int start, int size, } } +void GpuPsGraphTable::clear_graph_info(int gpu_id) { + if (tables_.size() && tables_[gpu_id] != NULL) { + delete tables_[gpu_id]; + } + auto& graph = gpu_graph_list[gpu_id]; + if (graph.neighbor_list != NULL) { + cudaFree(graph.neighbor_list); + } + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } +} void GpuPsGraphTable::clear_graph_info() { if (tables_.size()) { for (auto table : tables_) delete table; @@ -404,14 +428,52 @@ void GpuPsGraphTable::clear_graph_info() { /* the parameter std::vector cpu_graph_list is generated by cpu. it saves the graph to be saved on each gpu. - for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number == i - In this function, memory is allocated on each gpu to save the graphs, gpu i saves the ith graph from cpu_graph_list */ +void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { + clear_graph_info(i); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // platform::CUDADeviceGuard guard(i); + gpu_graph_list[i] = GpuPsCommGraph(); + sample_status[i] = NULL; + tables_[i] = new Table(std::max((unsigned int)1, g.node_size) / load_factor_); + if (g.node_size > 0) { + std::vector keys; + std::vector offset; + cudaMalloc((void**)&gpu_graph_list[i].node_list, + g.node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_list[i].node_list, g.node_list, + g.node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); + for (unsigned int j = 0; j < g.node_size; j++) { + keys.push_back(g.node_list[j].node_id); + offset.push_back(j); + } + build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + gpu_graph_list[i].node_size = g.node_size; + } else { + build_ps(i, NULL, NULL, 0, 1024, 8); + gpu_graph_list[i].node_list = NULL; + gpu_graph_list[i].node_size = 0; + } + if (g.neighbor_size) { + int* addr; + cudaMalloc((void**)&addr, g.neighbor_size * sizeof(int)); + cudaMemset(addr, 0, g.neighbor_size * sizeof(int)); + sample_status[i] = addr; + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + g.neighbor_size * sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, g.neighbor_list, + g.neighbor_size * sizeof(int64_t), cudaMemcpyHostToDevice); + gpu_graph_list[i].neighbor_size = g.neighbor_size; + } else { + gpu_graph_list[i].neighbor_list = NULL; + gpu_graph_list[i].neighbor_size = 0; + } +} void GpuPsGraphTable::build_graph_from_cpu( std::vector& cpu_graph_list) { VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " @@ -424,20 +486,21 @@ void GpuPsGraphTable::build_graph_from_cpu( for (int i = 0; i < cpu_graph_list.size(); i++) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); // platform::CUDADeviceGuard guard(i); - gpu_graph_list.push_back(GpuPsCommGraph()); - 
sample_status.push_back(NULL); - auto table = - new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); - tables_.push_back(table); + gpu_graph_list[i] = GpuPsCommGraph(); + sample_status[i] = NULL; + // auto table = + // new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); + tables_[i] = new Table( + std::max((unsigned int)1, cpu_graph_list[i].node_size) / load_factor_); if (cpu_graph_list[i].node_size > 0) { std::vector keys; - std::vector offset; + std::vector offset; cudaMalloc((void**)&gpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); - for (int j = 0; j < cpu_graph_list[i].node_size; j++) { + for (unsigned int j = 0; j < cpu_graph_list[i].node_size; j++) { keys.push_back(cpu_graph_list[i].node_list[j].node_id); offset.push_back(j); } @@ -468,10 +531,15 @@ void GpuPsGraphTable::build_graph_from_cpu( cudaDeviceSynchronize(); } -NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, - int64_t* key, - int sample_size, - int len) { +NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3( + NeighborSampleQuery q, bool cpu_switch) { + return graph_neighbor_sample_v2(q.gpu_id, q.key, q.sample_size, q.len, + cpu_switch); +} +NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, + int64_t* key, + int sample_size, + int len) { /* comment 2 this function shares some kernels with heter_comm_inl.h @@ -479,7 +547,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, gpu_id:the id of gpu. len:how many keys are used,(the length of array key) sample_size:how many neighbors should be sampled for each node in key. 
- the code below shuffle the key array to make the keys that belong to a gpu-card stay together, the shuffled result is saved on d_shard_keys, @@ -489,18 +556,16 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] = b, if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1 - for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2 when we run this neighbor_sample function, the key is shuffled to [0,2,4,6,8,1,3,5,7] the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0, the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1, h_left = [0,5],h_right = [4,8] - */ - NeighborSampleResult* result = new NeighborSampleResult(); - result->initialize(sample_size, len, resource_->dev_id(gpu_id)); + NeighborSampleResult result; + result.initialize(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { return result; } @@ -508,8 +573,8 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); // cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); // cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); - int* actual_sample_size = result->actual_sample_size; - int64_t* val = result->val; + int* actual_sample_size = result.actual_sample_size; + int64_t* val = result.val; int total_gpu = resource_->total_device(); // int dev_id = resource_->dev_id(gpu_id); auto stream = resource_->local_stream(gpu_id, 0); @@ -601,15 +666,15 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, // use the key-value map to update alloc_mem_i[0,shard_len) // tables_[i]->rwlock_->RDLock(); tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), + reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); // node.in_stream); int shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; - int* id_array = reinterpret_cast(node.val_storage); - int* actual_size_array = id_array + shard_len; - int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + unsigned int* id_array = reinterpret_cast(node.val_storage); + int* actual_size_array = (int*)(id_array + shard_len); + int64_t* sample_array = (int64_t*)(actual_size_array + shard_len); int sample_grid_size = (shard_len - 1) / dim_y + 1; dim3 block(parallel_sample_size, dim_y); dim3 grid(sample_grid_size); @@ -686,10 +751,10 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, return result; } -NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2( +NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int gpu_id, int64_t* key, int sample_size, int len, bool cpu_query_switch) { - NeighborSampleResult* result = new NeighborSampleResult(); - result->initialize(sample_size, len, resource_->dev_id(gpu_id)); + NeighborSampleResult result; + result.initialize(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { return result; @@ -697,8 +762,8 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2( platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - int* actual_sample_size = result->actual_sample_size; - int64_t* val = result->val; + int* actual_sample_size = result.actual_sample_size; + int64_t* val = result.val; int total_gpu = resource_->total_device(); auto stream = 
resource_->local_stream(gpu_id, 0); @@ -742,6 +807,8 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2( if (shard_len == 0) { continue; } + // create_storage(gpu_id, i, shard_len * sizeof(int64_t), + // shard_len * (1 + sample_size) * sizeof(int64_t)); create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); } @@ -764,15 +831,18 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2( platform::CUDADeviceGuard guard(resource_->dev_id(i)); // If not found, val is -1. tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), + reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); auto shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; - int* id_array = reinterpret_cast(node.val_storage); - int* actual_size_array = id_array + shard_len; - int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + // int* id_array = reinterpret_cast(node.val_storage); + // int* actual_size_array = id_array + shard_len; + // int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + unsigned int* id_array = reinterpret_cast(node.val_storage); + int* actual_size_array = (int*)(id_array + shard_len); + int64_t* sample_array = (int64_t*)(actual_size_array + shard_len); constexpr int WARP_SIZE = 32; constexpr int BLOCK_WARPS = 128 / WARP_SIZE; constexpr int TILE_SIZE = BLOCK_WARPS * 16; @@ -850,6 +920,28 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2( fill_dvalues<<>>( d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, sample_size, len); + + { + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + thrust::device_ptr t_actual_sample_size(actual_sample_size); + int total_sample_size = + thrust::reduce(t_actual_sample_size, t_actual_sample_size + len); + result.actual_val_mem = + memory::AllocShared(place, total_sample_size * sizeof(int64_t)); + result.actual_val = (int64_t*)(result.actual_val_mem)->ptr(); + + result.set_total_sample_size(total_sample_size); + + thrust::device_vector cumsum_actual_sample_size(len); + thrust::exclusive_scan(t_actual_sample_size, t_actual_sample_size + len, + cumsum_actual_sample_size.begin(), 0); + fill_actual_vals<<>>( + val, result.actual_val, actual_sample_size, + thrust::raw_pointer_cast(cumsum_actual_sample_size.data()), sample_size, + len); + } + for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -861,29 +953,27 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample_v2( return result; } -NodeQueryResult* GpuPsGraphTable::graph_node_sample(int gpu_id, - int sample_size) {} +NodeQueryResult GpuPsGraphTable::graph_node_sample(int gpu_id, + int sample_size) { + return NodeQueryResult(); +} -NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, - int query_size) { - NodeQueryResult* result = new NodeQueryResult(); +NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, + int query_size) { + NodeQueryResult result; if (query_size <= 0) return result; - int& actual_size = result->actual_sample_size; + int& actual_size = result.actual_sample_size; actual_size = 0; - cudaMalloc((void**)&result->val, query_size * sizeof(int64_t)); - int64_t* val = result->val; // int dev_id = resource_->dev_id(gpu_id); // platform::CUDADeviceGuard guard(dev_id); - platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; - int size = 0; + std::vector idx, gpu_begin_pos, local_begin_pos; + int sample_size; /* if idx[i] = a, gpu_begin_pos[i] = p1, gpu_local_begin_pos[i] = p2; sample_size[i] = s; then on gpu a, the nodes of positions [p1,p1 + s) should be returned and saved from the p2 position on the sample_result array - for example: suppose gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] @@ -893,23 +983,54 @@ NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, gpu_begin_pos = [3,0] local_begin_pos = [0,3] sample_size = [2,3] - */ + std::function range_check = []( + int x, int y, int x1, int y1, int& x2, int& y2) { + if (y <= x1 || x >= y1) return 0; + y2 = min(y, y1); + x2 = max(x1, x); + return y2 - x2; + }; + auto graph = gpu_graph_list[gpu_id]; + if (graph.node_size == 0) { + return result; + } + int x2, y2; + int len = range_check(start, start + query_size, 0, graph.node_size, x2, y2); + + if (len == 0) { + return result; + } + int64_t* val; + sample_size = len; + result.initialize(len, resource_->dev_id(gpu_id)); + actual_size = len; + val = result.val; + int dev_id_i = resource_->dev_id(gpu_id); + platform::CUDADeviceGuard guard(dev_id_i); + // platform::CUDADeviceGuard guard(i); + int grid_size = (len - 1) / block_size_ + 1; + node_query_example<<remote_stream(gpu_id, gpu_id)>>>( + gpu_graph_list[gpu_id], x2, len, (int64_t*)val); + cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)); + return result; + /* for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { auto graph = gpu_graph_list[i]; if (graph.node_size == 0) { continue; } - if (graph.node_size + size > start) { - int cur_size = min(query_size, graph.node_size + size - start); - query_size -= cur_size; - idx.emplace_back(i); - gpu_begin_pos.emplace_back(start - size); + int x2, y2; + int len = range_check(start, start + query_size, size, + size + graph.node_size, x2, y2); + if (len > 0) { + idx.push_back(i); + gpu_begin_pos.emplace_back(x2 - size); local_begin_pos.emplace_back(actual_size); - start += cur_size; - actual_size += cur_size; - sample_size.emplace_back(cur_size); - create_storage(gpu_id, i, 1, cur_size * sizeof(int64_t)); + sample_size.push_back(len); + actual_size += len; + create_storage(gpu_id, i, 1, len * sizeof(int64_t)); } size += graph.node_size; } @@ -936,7 +1057,11 @@ NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, auto& node = path_[gpu_id][idx[i]].nodes_.front(); 
cudaStreamSynchronize(node.out_stream); } + for (auto x : idx) { + destroy_storage(gpu_id, x); + } return result; + */ } } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 2f099d09397d5..93854d7f1ec3f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -58,6 +58,11 @@ void GraphGpuWrapper::set_device(std::vector ids) { device_id_mapping.push_back(device_id); } } +std::vector> GraphGpuWrapper::get_all_id(int type, int idx, + int slice_num) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->get_all_id(type, idx, slice_num); +} void GraphGpuWrapper::set_up_types(std::vector &edge_types, std::vector &node_types) { id_to_edge = edge_types; @@ -76,6 +81,32 @@ void GraphGpuWrapper::set_up_types(std::vector &edge_types, this->table_feat_conf_feat_shape.resize(node_types.size()); } +void GraphGpuWrapper::make_partitions(int idx, int64_t byte_size, + int device_len) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->make_partitions(idx, byte_size, device_len); +} +int32_t GraphGpuWrapper::load_next_partition(int idx) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->load_next_partition(idx); +} + +void GraphGpuWrapper::set_search_level(int level) { + ((GpuPsGraphTable *)graph_table)->cpu_graph_table->set_search_level(level); +} + +std::vector GraphGpuWrapper::get_partition(int idx, int num) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->get_partition(idx, num); +} +int32_t GraphGpuWrapper::get_partition_num(int idx) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->get_partition_num(idx); +} +void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->make_complementary_graph(idx, byte_size); +} void GraphGpuWrapper::load_edge_file(std::string name, std::string filepath, bool reverse) { // 'e' means load edge @@ -132,10 +163,11 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name, } VLOG(0) << "add conf over"; } +void GraphGpuWrapper::init_search_level(int level) { search_level = level; } void GraphGpuWrapper::init_service() { table_proto.set_task_pool_size(24); - + table_proto.set_search_level(search_level); table_proto.set_table_name("cpu_graph_table"); table_proto.set_use_cache(false); for (int i = 0; i < id_to_edge.size(); i++) @@ -158,14 +190,21 @@ void GraphGpuWrapper::init_service() { graph_table = (char *)g; } -void GraphGpuWrapper::upload_batch(std::vector> &ids) { +void GraphGpuWrapper::upload_batch(int idx, + std::vector> &ids) { GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; - std::vector vec; + // std::vector vec; for (int i = 0; i < ids.size(); i++) { - vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(0, ids[i])); + // vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i])); + GpuPsCommGraph sub_graph = + g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]); + g->build_graph_on_single_gpu(sub_graph, i); + sub_graph.release_on_cpu(); + VLOG(0) << "sub graph on gpu " << i << " is built"; } - g->build_graph_from_cpu(vec); + // g->build_graph_from_cpu(vec); } + void GraphGpuWrapper::initialize() { std::vector device_id_mapping; for (int i = 0; i < 2; i++) device_id_mapping.push_back(i); @@ -238,10 +277,10 @@ void GraphGpuWrapper::test() { ((GpuPsGraphTable *)graph_table) ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); int64_t *res = new 
int64_t[7]; - cudaMemcpy(res, neighbor_sample_res->val, 3 * 2 * sizeof(int64_t), + cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), cudaMemcpyDeviceToHost); int *actual_sample_size = new int[3]; - cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, + cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, 3 * sizeof(int), cudaMemcpyDeviceToHost); // 3, 1, 3 @@ -256,12 +295,60 @@ void GraphGpuWrapper::test() { } } } -NeighborSampleResult *GraphGpuWrapper::graph_neighbor_sample(int gpu_id, - int64_t *key, - int sample_size, - int len) { +NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( + NeighborSampleQuery q, bool cpu_switch) { + return ((GpuPsGraphTable *)graph_table) + ->graph_neighbor_sample_v3(q, cpu_switch); +} + +// this function is contributed by Liwb5 +std::vector GraphGpuWrapper::graph_neighbor_sample( + int gpu_id, std::vector &key, int sample_size) { + int64_t *cuda_key; + platform::CUDADeviceGuard guard(gpu_id); + + cudaMalloc(&cuda_key, key.size() * sizeof(int64_t)); + cudaMemcpy(cuda_key, key.data(), key.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + + auto neighbor_sample_res = + ((GpuPsGraphTable *)graph_table) + ->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size()); + + int *actual_sample_size = new int[key.size()]; + cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, + key.size() * sizeof(int), + cudaMemcpyDeviceToHost); // 3, 1, 3 + int cumsum = 0; + for (int i = 0; i < key.size(); i++) { + cumsum += actual_sample_size[i]; + } + /* VLOG(0) << "cumsum " << cumsum; */ + + std::vector cpu_key, res; + cpu_key.resize(key.size() * sample_size); + + cudaMemcpy(cpu_key.data(), neighbor_sample_res.val, + key.size() * sample_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); + for (int i = 0; i < key.size(); i++) { + for (int j = 0; j < actual_sample_size[i]; j++) { + res.push_back(key[i]); + res.push_back(cpu_key[i * sample_size + j]); + } + } + /* for(int i = 0;i < res.size();i ++) { */ + /* VLOG(0) << i << " " << res[i]; */ + /* } */ + + cudaFree(cuda_key); + return res; +} + +NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int start, + int query_size) { return ((GpuPsGraphTable *)graph_table) - ->graph_neighbor_sample(gpu_id, key, sample_size, len); + ->query_node_list(gpu_id, start, query_size); } #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 26ce4c8adce21..b638311304773 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,20 +22,37 @@ namespace framework { #ifdef PADDLE_WITH_HETERPS class GraphGpuWrapper { public: - char* graph_table; + static GraphGpuWrapper* GetInstance() { + static GraphGpuWrapper wrapper; + return &wrapper; + } void initialize(); void test(); void set_device(std::vector ids); void init_service(); void set_up_types(std::vector& edge_type, std::vector& node_type); - void upload_batch(std::vector>& ids); + void upload_batch(int idx, std::vector>& ids); void add_table_feat_conf(std::string table_name, std::string feat_name, std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); - NeighborSampleResult* graph_neighbor_sample(int gpu_id, int64_t* key, - int sample_size, int len); + int32_t load_next_partition(int idx); + int32_t 
get_partition_num(int idx); + std::vector get_partition(int idx, int num); + void make_partitions(int idx, int64_t byte_size, int device_len); + void make_complementary_graph(int idx, int64_t byte_size); + void set_search_level(int level); + void init_search_level(int level); + std::vector> get_all_id(int type, int idx, + int slice_num); + NodeQueryResult query_node_list(int gpu_id, int start, int query_size); + NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, + bool cpu_switch); + std::vector graph_neighbor_sample(int gpu_id, + std::vector& key, + int sample_size); + std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -44,6 +61,8 @@ class GraphGpuWrapper { std::vector> table_feat_conf_feat_shape; ::paddle::distributed::GraphParameter table_proto; std::vector device_id_mapping; + int search_level = 1; + char* graph_table; }; #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index fc54be447fe17..87b62c6d380a4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -298,6 +298,8 @@ void HashTable::update(const KeyType* d_keys, template class HashTable; template class HashTable; +template class HashTable; +template class HashTable; template void HashTable::get< cudaStream_t>(const unsigned long* d_keys, @@ -308,6 +310,10 @@ template void HashTable::get(const long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); // template void // HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t @@ -323,6 +329,14 @@ template void HashTable::insert(const long* d_keys, size_t len, cudaStream_t stream); +template void HashTable::insert( + const long* d_keys, const unsigned long* d_vals, size_t len, + cudaStream_t stream); + +template void HashTable::insert( + const long* d_keys, const unsigned int* d_vals, size_t len, + cudaStream_t stream); + // template void HashTable::insert< // cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 51432e9de81fb..7ebf7660ee521 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -584,7 +584,7 @@ void HeterComm::pull_sparse(int num, for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; - if (shard_len == 0) { + if (h_left[i] == -1 || h_right[i] == -1) { continue; } create_storage(num, i, shard_len * sizeof(KeyType), @@ -630,6 +630,9 @@ void HeterComm::pull_sparse(int num, sync_stream(stream); for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } destroy_storage(num, i); } } @@ -747,6 +750,9 @@ void HeterComm::push_sparse(int dev_num, } for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } destroy_storage(dev_num, i); } } @@ -862,6 +868,9 @@ void HeterComm::push_sparse(int dev_num, } for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } destroy_storage(dev_num, i); } } 
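Note on the reworked GraphGpuWrapper sampling API in the hunks above: besides the NeighborSampleQuery-based graph_neighbor_sample_v3, the wrapper now exposes an overload of graph_neighbor_sample that takes host-side int64_t keys, stages them on the chosen GPU internally, and returns the sampled edges as a flat {src, neighbor, src, neighbor, ...} vector. A rough usage sketch, assuming a PADDLE_WITH_HETERPS build (the demo function name and the concrete ids below are illustrative only, not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"

// Illustrative call site only: gpu_id, sample_size and the node ids are
// made-up values used to show the shape of the new interface.
void neighbor_sample_demo() {
  auto* wrapper = paddle::framework::GraphGpuWrapper::GetInstance();
  std::vector<int64_t> keys = {0, 1, 2};  // host-side node ids
  const int gpu_id = 0;
  const int sample_size = 2;
  // The result is flattened as {src0, nbr0, src1, nbr1, ...}; a source with
  // fewer than sample_size neighbors simply contributes fewer pairs.
  std::vector<int64_t> pairs =
      wrapper->graph_neighbor_sample(gpu_id, keys, sample_size);
  for (size_t i = 0; i + 1 < pairs.size(); i += 2) {
    std::cout << "sampled edge " << pairs[i] << " -> " << pairs[i + 1] << "\n";
  }
}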
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index 2e94a7f4059ab..b3a38a6dfde49 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -28,6 +28,16 @@ namespace platform = paddle::platform; // paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( // std::vector ids) +std::string edges[] = { + std::string("0\t1"), std::string("0\t9"), std::string("1\t2"), + std::string("1\t0"), std::string("2\t1"), std::string("2\t3"), + std::string("3\t2"), std::string("3\t4"), std::string("4\t3"), + std::string("4\t5"), std::string("5\t4"), std::string("5\t6"), + std::string("6\t5"), std::string("6\t7"), std::string("7\t6"), + std::string("7\t8"), +}; +char edge_file_name[] = "edges1.txt"; + std::string nodes[] = { std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), @@ -53,12 +63,17 @@ std::vector user_feature_dtype = {"float32", "int32", "string", std::vector item_feature_dtype = {"float32"}; std::vector user_feature_shape = {1, 2, 1, 1}; std::vector item_feature_shape = {1}; -void prepare_file(char file_name[]) { +void prepare_file(char file_name[], bool load_edge) { std::ofstream ofile; ofile.open(file_name); - - for (auto x : nodes) { - ofile << x << std::endl; + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } } ofile.close(); } @@ -85,9 +100,10 @@ TEST(TEST_FLEET, test_cpu_cache) { g_f1->add_dtype(item_feature_dtype[i]); g_f1->add_shape(item_feature_shape[i]); } - prepare_file(node_file_name); + prepare_file(node_file_name, false); + prepare_file(edge_file_name, true); table_proto.set_shard_num(24); - + table_proto.set_search_level(2); std::shared_ptr resource = std::make_shared(device_id_mapping); resource->enable_p2p(); @@ -120,11 +136,14 @@ TEST(TEST_FLEET, test_cpu_cache) { } g.cpu_graph_table->build_sampler(0); ids1.push_back(5); + ids1.push_back(7); vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids0)); vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids1)); vec[0].display_on_cpu(); vec[1].display_on_cpu(); - g.build_graph_from_cpu(vec); + // g.build_graph_from_cpu(vec); + g.build_graph_on_single_gpu(vec[0], 0); + g.build_graph_on_single_gpu(vec[1], 1); int64_t cpu_key[3] = {0, 1, 2}; /* std::vector> buffers(3); @@ -136,26 +155,84 @@ TEST(TEST_FLEET, test_cpu_cache) { } */ void *key; - platform::CUDADeviceGuard guard(0); - cudaMalloc((void **)&key, 3 * sizeof(int64_t)); - cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); - auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 2, 3); - int64_t *res = new int64_t[7]; - cudaMemcpy(res, neighbor_sample_res->val, 3 * 2 * sizeof(int64_t), - cudaMemcpyDeviceToHost); - int *actual_sample_size = new int[3]; - cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, - 3 * sizeof(int), - cudaMemcpyDeviceToHost); // 3, 1, 3 - - //{0,9} or {9,0} is expected for key 0 - //{0,2} or {2,0} is expected for key 1 - //{1,3} or {3,1} is expected for key 2 - for (int i = 0; i < 3; i++) { - VLOG(0) << "actual sample size for " << i << " is " - << actual_sample_size[i]; - for (int j = 0; j < actual_sample_size[i]; j++) { - VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + j]; + int device_len = 2; + for (int i = 0; i < 2; i++) { + // platform::CUDADeviceGuard 
guard(i); + LOG(0) << "query on card " << i; + //{1,9} or {9,1} is expected for key 0 + //{0,2} or {2,0} is expected for key 1 + //{1,3} or {3,1} is expected for key 2 + int step = 2; + int cur = 0; + while (true) { + auto node_query_res = g.query_node_list(i, cur, step); + node_query_res.display(); + if (node_query_res.get_len() == 0) { + VLOG(0) << "no more ids,break"; + break; + } + cur += node_query_res.get_len(); + NeighborSampleQuery query; + query.initialize(i, node_query_res.get_val(), 1, + node_query_res.get_len()); + query.display(); + auto c = g.graph_neighbor_sample_v3(query, false); + c.display(); + } + } + g.cpu_graph_table->set_search_level(2); + // g.cpu_graph_table->Load_to_ssd(edge_file_name,"e>u2u"); + g.cpu_graph_table->Load(edge_file_name, "e>u2u"); + g.cpu_graph_table->make_partitions(0, 64, 2); + int index = 0; + while (g.cpu_graph_table->load_next_partition(0) != -1) { + auto all_ids = g.cpu_graph_table->get_all_id(0, 0, device_len); + for (auto x : all_ids) { + for (auto y : x) { + VLOG(0) << "part " << index << " " << y; + } + } + for (int i = 0; i < all_ids.size(); i++) { + GpuPsCommGraph sub_graph = + g.cpu_graph_table->make_gpu_ps_graph(0, all_ids[i]); + g.build_graph_on_single_gpu(sub_graph, i); + VLOG(2) << "sub graph on gpu " << i << " is built"; + } + VLOG(0) << "start to iterate gpu graph node"; + g.cpu_graph_table->make_complementary_graph(0, 64); + for (int i = 0; i < 2; i++) { + // platform::CUDADeviceGuard guard(i); + LOG(0) << "query on card " << i; + int step = 2; + int cur = 0; + while (true) { + auto node_query_res = g.query_node_list(i, cur, step); + node_query_res.display(); + if (node_query_res.get_len() == 0) { + VLOG(0) << "no more ids,break"; + break; + } + cur += node_query_res.get_len(); + NeighborSampleQuery query, q1; + query.initialize(i, node_query_res.get_val(), 4, + node_query_res.get_len()); + query.display(); + auto c = g.graph_neighbor_sample_v3(query, true); + c.display(); + platform::CUDADeviceGuard guard(i); + int64_t *key; + VLOG(0) << "sample key 1 globally"; + g.cpu_graph_table->set_search_level(2); + cudaMalloc((void **)&key, sizeof(int64_t)); + int64_t t_key = 1; + cudaMemcpy(key, &t_key, sizeof(int64_t), cudaMemcpyHostToDevice); + q1.initialize(i, (int64_t)key, 2, 1); + auto d = g.graph_neighbor_sample_v3(q1, true); + d.display(); + cudaFree(key); + g.cpu_graph_table->set_search_level(1); + } } + index++; } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 52bfe42cc5028..64765c98fd04b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -630,7 +630,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif #ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[dev]; + auto& task_ptrs = device_task_ptrs[shard_id]; #endif int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index d7a2a42ca7dc7..2a8ffbf431ecd 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { @@ -51,8 +52,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { } paddle::any Attr(const std::string& name) const override { - auto& attr = ctx_.Attrs().GetAttr(name); - return GetAttrValue(attr); + auto* attr = ctx_.Attrs().GetAttr(name); + PADDLE_ENFORCE_NOT_NULL( + attr, platform::errors::NotFound( + "Attribute (%s) should be in AttributeMap.", name)); + return GetAttrValue(*attr); } size_t InputSize(const std::string& name) const override { @@ -69,6 +73,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { } bool IsDenseTensorInput(const std::string& name) const override { + auto var_type = ctx_.GetInputVarType(name); + return var_type == proto::VarType::LOD_TENSOR; + } + + bool IsDenseTensorInputs(const std::string& name) const override { auto var_types = ctx_.GetInputsVarType(name); return std::all_of(var_types.begin(), var_types.end(), [](const proto::VarType::Type& type) { @@ -77,11 +86,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { } bool IsSelectedRowsInput(const std::string& name) const override { - auto var_types = ctx_.GetInputsVarType(name); - return std::all_of(var_types.begin(), var_types.end(), - [](const proto::VarType::Type& type) { - return type == proto::VarType::SELECTED_ROWS; - }); + auto var_type = ctx_.GetInputVarType(name); + return var_type == proto::VarType::SELECTED_ROWS; } bool IsDenseTensorVectorInput(const std::string& name) const override { @@ -320,7 +326,7 @@ void CompatInferMetaContext::EmplaceBackOutput(CompatMetaTensor output) { } void CompatInferMetaContext::EmplaceBackInputs( - paddle::SmallVector inputs) { + paddle::small_vector inputs) { int index = compat_inputs_.size(); input_range_.emplace_back(std::pair(index, index + inputs.size())); compat_inputs_.insert(compat_inputs_.end(), @@ -329,7 +335,7 @@ void CompatInferMetaContext::EmplaceBackInputs( } void CompatInferMetaContext::EmplaceBackOutputs( - paddle::SmallVector + paddle::small_vector outputs) { int index = compat_outputs_.size(); output_range_.emplace_back( @@ -428,7 +434,7 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, infer_meta_context.EmplaceBackInput( std::move(CompatMetaTensor(input_var[0], ctx->IsRuntime()))); } else { - paddle::SmallVector + paddle::small_vector inputs; for (const auto& in : input_var) { inputs.emplace_back( @@ -447,236 +453,252 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto& attr_name = attr_names[i]; - if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { - // When attr is a vector_tensor or tensor, transform it to IntArray - if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { - auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); - if (ctx->IsRuntime()) { - // If is in runtime, we will get tensor's value for IntArray - // and push it into attrs - std::vector vars; - vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { - vars.push_back(BOOST_GET_CONST(Variable*, infershape_inputs[i])); + VLOG(6) << "BuildInferMetaContext: " << attr_name << ": " + << attr_defs[i].type_index; + auto* attr_ptr = 
attr_reader.GetAttr(attr_name); + switch (attr_defs[i].type_index) { + case phi::AttributeType::SCALAR: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::FLOAT: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(float, attr))); + break; + case framework::proto::AttrType::INT: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(int, attr))); + break; + case framework::proto::AttrType::STRING: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(std::string, attr))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "InferMetaContext.", + attr_name)); } - if (infershape_inputs.size() != 1) { - infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVarList(vars))); + } else if (ctx->HasInput(attr_name)) { + auto infershape_input = std::move(ctx->GetInputVarPtrs(attr_name)); + if (infershape_input.size() == 1) { + if (ctx->IsRuntime()) { + Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiScalarFromVar(*var))); + } else { + phi::Scalar tensor_scalar(-1); + tensor_scalar.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + } } else { - infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVar(*vars[0]))); + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input.size() when cast op attribute `%s` to Scalar, " + "expected 1, but actually is %d .", + attr_name, infershape_input.size())); } } else { - // If is not in runtime, we will set default value(-1) for IntArray - std::vector vars; - vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); ++i) { - vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); + // do nothing, skip current attr + } + break; + case phi::AttributeType::INT_ARRAY: + // When attr is a vector_tensor or tensor, transform it to IntArray + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: + infer_meta_context.EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::LONGS: + infer_meta_context.EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::INT: + infer_meta_context.EmplaceBackAttr( + phi::IntArray({BOOST_GET_CONST(int, attr)})); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "construct InferMetaContext.", + attr_name)); } - - int64_t num_ele = 0; - if (vars.size() == 1) { - num_ele = 1; - const auto& tensor_dims = vars[0]->GetShape(); - for (size_t i = 0; i < tensor_dims.size(); ++i) { - num_ele *= tensor_dims[i]; + } else if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { + auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); + if (ctx->IsRuntime()) { + // If is in runtime, we will get tensor's value for IntArray + // and push it into attrs + std::vector vars; + vars.reserve(infershape_inputs.size()); + for (size_t i = 0; i < infershape_inputs.size(); i++) { + vars.push_back(BOOST_GET_CONST(Variable*, infershape_inputs[i])); } - - if (num_ele <= 0) { - PADDLE_THROW(platform::errors::Unimplemented( - "Invalid number for construct phi::IntArray, expected 
" - "number > 0, but actually is %d. ", - num_ele)); + if (infershape_inputs.size() != 1) { + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVarList(vars))); + } else { + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVar(*vars[0]))); } - } else { - num_ele = vars.size(); + // If is not in runtime, we will set default value(-1) for IntArray + std::vector vars; + vars.reserve(infershape_inputs.size()); + for (size_t i = 0; i < infershape_inputs.size(); ++i) { + vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); + } + + int64_t num_ele = 0; + if (vars.size() == 1) { + num_ele = 1; + const auto& tensor_dims = vars[0]->GetShape(); + for (size_t i = 0; i < tensor_dims.size(); ++i) { + num_ele *= tensor_dims[i]; + } + + if (num_ele <= 0) { + num_ele = tensor_dims.size(); + } + + } else { + num_ele = vars.size(); + } + phi::IntArray tensor_attr(std::vector(num_ele, -1)); + tensor_attr.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_attr)); } - phi::IntArray tensor_attr(std::vector(num_ele, -1)); - tensor_attr.SetFromTensor(true); - infer_meta_context.EmplaceBackAttr(std::move(tensor_attr)); - } - } else if (ctx->HasAttr(attr_name)) { - auto& attr = attr_reader.GetAttr(attr_name); - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int))) { - infer_meta_context.EmplaceBackAttr( - phi::IntArray({BOOST_GET_CONST(int, attr)})); } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to IntArray when " - "construct InferMetaContext.", - attr_name)); + // do nothing, skip current attr } - } - } else if (attr_defs[i].type_index == - std::type_index(typeid(phi::Scalar))) { - if (ctx->HasAttr(attr_name)) { - // TODO(chentianyu03): support other attrs later - auto& attr = attr_reader.GetAttr(attr_name); - if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - infer_meta_context.EmplaceBackAttr( - phi::Scalar(BOOST_GET_CONST(float, attr))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::string))) { - infer_meta_context.EmplaceBackAttr( - phi::Scalar(BOOST_GET_CONST(std::string, attr))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int))) { - infer_meta_context.EmplaceBackAttr( - phi::Scalar(BOOST_GET_CONST(int, attr))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to Scalar when construct " - "InferMetaContext.", - attr_name)); - } - } else if (ctx->HasInput(attr_name)) { - auto infershape_input = std::move(ctx->GetInputVarPtrs(attr_name)); - if (infershape_input.size() == 1) { - if (ctx->IsRuntime()) { - Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); - infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiScalarFromVar(*var))); - } else { - phi::Scalar tensor_scalar(-1); - tensor_scalar.SetFromTensor(true); - infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + break; + case phi::AttributeType::SCALARS: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case 
framework::proto::AttrType::INTS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::LONGS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOATS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOAT64S: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); } } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input.size() when cast op attribute `%s` to Scalar, " - "expected 1, but actually is %d .", - attr_name, infershape_input.size())); - } - } - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - auto& attr = attr_reader.GetAttr(attr_name); - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to vector when " - "construct InferMetaContext.", - attr_names[i])); - } - } else if (ctx->HasAttr(attr_name)) { - // Emplace Back Attr according to the type of attr. 
- auto& attr = attr_reader.GetAttr(attr_name); - if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (attr_defs[i].type_index == std::type_index(typeid(int))) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::string))) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Phi_Kernel args. - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); - const std::vector vector_int64_attr(vector_int_attr.begin(), - vector_int_attr.end()); - infer_meta_context.EmplaceBackAttr(vector_int64_attr); - } else { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); + // do nothing, skip current attr } - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPhiDataType( - static_cast( - BOOST_GET_CONST(int, attr))); - infer_meta_context.EmplaceBackAttr(data_type); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported attribute type is received when call " - "InferShapeFunctor.")); - } - } else if (ctx->HasInput(attr_name)) { - // convert from data - if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { - if (ctx->IsRuntime()) { - auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); - auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); - auto val = experimental::MakePhiScalarFromVar(*var_temp); - int32_t val_int = val.template to(); - infer_meta_context.EmplaceBackAttr(val_int); + break; + default: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (attr_defs[i].type_index) { + case phi::AttributeType::FLOAT32: + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + break; + case phi::AttributeType::INT32: + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + break; + case phi::AttributeType::BOOL: + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + break; + case phi::AttributeType::INT64: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr)); + break; + case phi::AttributeType::INT32S: + 
infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = paddle::framework::TransToPhiDataType( + static_cast( + BOOST_GET_CONST(int, attr))); + infer_meta_context.EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr)); + break; + case phi::AttributeType::INT64S: + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::LONGS: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case framework::proto::AttrType::INTS: { + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + infer_meta_context.EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector " + "when " + "construct KernelContext.", + attr_names[i])); + } + break; + case phi::AttributeType::FLOAT32S: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::STRINGS: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::BOOLS: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::FLOAT64S: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when construct " + "KernelContext in dygraph.", + attr_names[i])); + } } else { - infer_meta_context.EmplaceBackAttr(-1); + // do nothing, skip currnet attr } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Get value from variable only support int yet")); - } } } @@ -689,7 +711,7 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, infer_meta_context.EmplaceBackOutput( std::move(CompatMetaTensor(output_var[0], ctx->IsRuntime()))); } else { - paddle::SmallVector + paddle::small_vector outputs; for (const auto& out : output_var) { if (ctx->IsRuntime()) { diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index e54f2e81e7e9f..855e873b30951 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -100,9 +100,10 @@ class CompatInferMetaContext : public phi::InferMetaContext { void EmplaceBackOutput(CompatMetaTensor output); void EmplaceBackInputs( - paddle::SmallVector inputs); + paddle::small_vector + inputs); void EmplaceBackOutputs( - paddle::SmallVector + paddle::small_vector outputs); const phi::MetaTensor& InputAt(size_t idx) const override; @@ -121,9 +122,9 @@ class CompatInferMetaContext : public phi::InferMetaContext { virtual ~CompatInferMetaContext() = default; private: - paddle::SmallVector + paddle::small_vector compat_inputs_; - paddle::SmallVector + paddle::small_vector compat_outputs_; }; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 207ee713bf409..a3b49476d820f 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -159,7 +159,6 @@ if(WITH_IPU) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) pass_library(avg_shard_pass base DIR ipu) - pass_library(transfer_cast_op_pass base DIR ipu) endif() 
cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) @@ -226,6 +225,7 @@ endif() cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass reshape_transpose_matmul_v2_mkldnn_fuse_pass) cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass matmul_v2_transpose_reshape_fuse_pass) + cc_test(test_shuffle_channel_mkldnn_detect_pass SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc DEPS shuffle_channel_mkldnn_detect_pass) cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) cc_test(test_multi_gru_fuse_pass SRCS mkldnn/multi_gru_fuse_pass_tester.cc DEPS multi_gru_fuse_pass) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 8eb1b64a2763a..fbd8fda131b6d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2665,41 +2665,8 @@ PDNode *patterns::UnsupportedBfloat16::operator()() { return op; } -PDNode *patterns::LastBfloat16Ops::operator()() { - auto *op = pattern->NewNode(op_repr())->assert_is_op(); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - auto *op_out = pattern->NewNode(op_out_repr())->AsOutput(); - op->LinksTo({op_out}); - return op_out; -} - -PDNode *patterns::FirstBfloat16Ops::operator()() { - auto *op_in = pattern->NewNode(op_in_repr())->AsInput(); - - auto *op = pattern->NewNode(op_repr())->assert_is_op(); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - - op->LinksFrom({op_in}); - return op; -} - -PDNode *patterns::DuplicatedInputs::operator()() { - auto op = pattern->NewNode(op_repr())->assert_is_ops({"concat", "sum"}); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - return op; -} - -PDNode *patterns::DuplicatedOutputs::operator()() { - auto op = pattern->NewNode(op_repr())->assert_is_ops({"split"}); +PDNode *patterns::Bloat16Ops::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_op(); op->assert_more([&](Node *node) { return node->Op()->GetAttrIfExists("mkldnn_data_type") == "bfloat16"; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 434ede6cf7a3b..d7e265fe28bf9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1565,36 +1565,9 @@ struct UnsupportedBfloat16 : public PatternBase { PATTERN_DECL_NODE(op); }; -struct LastBfloat16Ops : public PatternBase { - LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "last_bfloat16_ops") {} - PDNode* operator()(); - - PATTERN_DECL_NODE(op); - PATTERN_DECL_NODE(op_out); -}; - -struct FirstBfloat16Ops : public PatternBase { - FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "first_bfloat16_ops") {} - 
PDNode* operator()(); - - PATTERN_DECL_NODE(op_in); - PATTERN_DECL_NODE(op); -}; - -struct DuplicatedInputs : public PatternBase { - DuplicatedInputs(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "many_inputs_op") {} - - PDNode* operator()(); - - PATTERN_DECL_NODE(op); -}; - -struct DuplicatedOutputs : public PatternBase { - DuplicatedOutputs(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "many_outputs_op") {} +struct Bloat16Ops : public PatternBase { + Bloat16Ops(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "many_bfloat16_ops") {} PDNode* operator()(); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index 02f000acc2a39..a6b82089dc4df 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -121,9 +121,9 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { } // Run passes - std::vector graph_pass = { - "forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass", "transfer_cast_op_pass"}; + std::vector graph_pass = {"forward_graph_extract_pass", + "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass"}; std::vector compile_pass = { "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", "inference_postprocess_pass"}; diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc deleted file mode 100644 index 5cd8358dc083e..0000000000000 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h" - -#include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" - -namespace paddle { -namespace framework { -namespace ir { - -// Transfer the target dtype of Cast Op to FP16 if the original target is FP32 -// and enable FP16 mode. 
-void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { - VLOG(10) << "enter TransferCastOpPass::ApplyImpl"; - VLOG(10) << "Raw Graph: "; - VLOG(10) << DebugString(graph); - - auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); - auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; - if (enable_fp16 && transfer_cast_op) { - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "popart_cast") { - if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == - "FLOAT") { - node->Op()->SetAttr("to", std::string("FLOAT16")); - } - } - } - } - - VLOG(10) << "Post Graph: "; - VLOG(10) << DebugString(graph); - VLOG(10) << "leave TransferCastOpPass::ApplyImpl"; -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(transfer_cast_op_pass, paddle::framework::ir::TransferCastOpPass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index 0ed2ec51b89cb..680dad5cc6b20 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -178,9 +178,11 @@ void InplaceAddToOpPass::Run(Graph *graph) const { auto *out_generated_op = dynamic_cast( out_var_ptr->GeneratedOp()); - // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy + // FIXME(zengjinle): the "custom_fused_dense_grad" is only used for + // MLPerf temporarily. Replace it with the formal op type in the future. if (right_generated_op->Name() != "conv2d_grad" && - right_generated_op->Name() != "resnet_unit_grad") { + right_generated_op->Name() != "resnet_unit_grad" && + right_generated_op->Name() != "custom_fused_dense_grad") { continue; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index f1bd34a5ad4f6..62b2be712beef 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -22,290 +22,226 @@ namespace paddle { namespace framework { namespace ir { -using string::PrettyLogDetail; +namespace { +class Quanter { + public: + void AddQuantOps() { + if (IsNotPermittedOpType()) return; -void UnlinkNodes(ir::Node* a, ir::Node* b) { - a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), - a->outputs.end()); - b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), - b->inputs.end()); -} + std::vector linked_xputs; -// Checking whether a reorder from FP32 to BF16 should be added before the input -// to the operator -bool IsPermittedInputName(const std::string& input_name) { - // Only the inputs listed in \"permitted_names\" requires quanitization before - // the bfloat16 operator. Other inputs, such as Filter and Bias are reordered - // in the kernel. 
- const std::vector permitted_names = {"X", "Y", "Input", - "ResidualData"}; - return (std::find(permitted_names.begin(), permitted_names.end(), - input_name) != permitted_names.end()); -} + for (const auto& logical_xput : op_xputs) { + std::vector quant_xput_names; + quant_xput_names.reserve(xputs_map.size()); -// Checking whether a reorder from BF16 to FP32 should be added after the output -// to the operator -bool IsPermittedOutputName(const std::string& output_name) { - // XShape is output in transpose2 and reshape2 operators used to store the - // shape and lod of X. So this output do not need dequantize before. - return (output_name != "XShape"); -} + const auto& logical_xput_name = logical_xput.first; + if (IsNotPermittedName(logical_xput_name)) continue; -void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, - int& quantize_counter) { - std::vector input_names; - - // Find the name of the input linking op to op_in - for (auto name : op->Op()->InputNames()) - for (auto input_name : op->Op()->Input(name)) - if (input_name == op_in->Name() && IsPermittedInputName(name)) - input_names.push_back(name); - - if (input_names.empty()) return; - - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); - - OpDesc q_desc; - q_desc.SetType("quantize"); - q_desc.SetInput("Input", std::vector({op_in->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node->Name()})); - q_desc.SetAttr("Scale", 1.f); - q_desc.SetAttr("Shift", 0.0f); - q_desc.SetAttr("bfloat16", true); - q_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. - - for (auto name = input_names.begin(); name < input_names.end(); name++) - op->Op()->SetInput(*name, - std::vector({quantize_out_node->Name()})); - - UnlinkNodes(op_in, op); - IR_NODE_LINK_TO(op_in, quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_node); - IR_NODE_LINK_TO(quantize_out_node, op); - quantize_counter++; -} + const auto& physical_xputs_names = logical_xput.second; + for (const auto& physical_xput_name : physical_xputs_names) { + if (IsAlreadyLinked(linked_xputs, physical_xput_name)) continue; -void AddQuantizes(Graph* g, ir::Node* op, int& quantize_counter) { - auto inputs = op->inputs; - PADDLE_ENFORCE_GE(inputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s inputs(%d) must be equal or greater than 1.", - op->Name(), inputs.size())); - PADDLE_ENFORCE_EQ(op->outputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s outputs(%d) must be equal to 1.", op->Name(), - op->outputs.size())); - - OpDesc q_desc; - q_desc.SetType("quantize"); - - std::vector quantize_out_nodes(inputs.size()); - std::vector quantize_out_node_names(inputs.size()); - - for (size_t i = 0; i < inputs.size(); i++) { - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - quantize_out_nodes[i] = g->CreateVarNode(&quantize_out_desc); - quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); - - q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node_names[i]})); - q_desc.SetAttr("Scale", 1.f); - q_desc.SetAttr("Shift", 0.0f); - q_desc.SetAttr("bfloat16", true); - q_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? 
op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. - - UnlinkNodes(inputs[i], op); - IR_NODE_LINK_TO(inputs[i], quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); - IR_NODE_LINK_TO(quantize_out_nodes[i], op); - quantize_counter++; + VarDesc quant_x_desc( + patterns::PDNodeName(get_op_type(), get_op_edge())); + auto quant_x_node = graph.CreateVarNode(&quant_x_desc); + const auto xput_name = quant_x_node->Name(); + quant_xput_names.emplace_back(xput_name); + + auto quant_op = create_quant_op(physical_xput_name, xput_name); + + auto physical_xput_node = xputs_map[physical_xput_name]; + link_nodes(physical_xput_node, quant_op, quant_x_node); + counter++; + linked_xputs.push_back(physical_xput_name); + } + + set_edge(logical_xput_name, quant_xput_names); + } } - op->Op()->SetInput("X", quantize_out_node_names); -} + int get_counter() const { return counter; } -// Operators like Concat and Sum have a single input name X, which actually -// consists of multiple inputs. Such operators require a different way to find -// pattern and add quantize ops. -void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int& quantize_counter) { - GraphPatternDetector gpd; - patterns::DuplicatedInputs duplicated_inputs{gpd.mutable_pattern(), - "duplicated_inputs"}; - duplicated_inputs(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_inputs); - AddQuantizes(g, op, quantize_counter); + virtual ~Quanter() = default; + + protected: + Graph& graph; + ir::Node* const op; + + std::map xputs_map; + const VariableNameMap& op_xputs; + + int counter = 0; + + Quanter(Graph& graph, ir::Node* const op, const VariableNameMap& op_xputs) + : graph(graph), op(op), op_xputs(op_xputs){}; + + virtual bool IsNotPermittedOpType() const = 0; + virtual bool IsNotPermittedName(const std::string& input_name) const = 0; + virtual std::string get_op_type() const = 0; + virtual std::string get_op_edge() const = 0; + virtual void link_nodes(ir::Node* const physical_xput_node, + ir::Node* const quant_op, + ir::Node* const quant_x_node) = 0; + virtual void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) = 0; + + bool IsAlreadyLinked(const std::vector& node_names, + const std::string& node_name) const { + return std::find(node_names.begin(), node_names.end(), node_name) != + node_names.end(); + } + + virtual ir::Node* create_quant_op(const std::string& input_name, + const std::string& output_name) const { + OpDesc op_desc; + op_desc.SetType(get_op_type()); + + op_desc.SetInput("Input", std::vector({input_name})); + op_desc.SetOutput("Output", std::vector({output_name})); + op_desc.SetAttr("Scale", 1.f); + op_desc.SetAttr("Shift", 0.0f); + op_desc.SetAttr("bfloat16", true); + op_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") + ? op->Op()->GetAttr("data_layout") + : std::string("NCHW")); + return graph.CreateOpNode(&op_desc); // OpDesc will be copied. 
+ } + + void UnlinkNodes(ir::Node* a, ir::Node* b) const { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); + } +}; + +class Quantizer final : public Quanter { + public: + Quantizer(Graph* const graph, ir::Node* const op) + : Quanter(*graph, op, op->Op()->Inputs()) { + auto inputs = op->inputs; + PADDLE_ENFORCE_GE( + inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal or greater than 1.", op->Name(), + inputs.size())); + + for (auto input : inputs) xputs_map[input->Name()] = input; }; - gpd(graph, handler); -} -// Adding quantize ops before all operators except Concat and Sum, which have -// already been handled in AddReoderBeforeDuplicatedInputs -void AddReoderBeforeSingleInputs(ir::Graph* graph, int& quantize_counter) { - GraphPatternDetector gpd; - patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), - "first_bfloat16_ops"}; - bfloat16_ops(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops); - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); - if (op->Op()->Type() != "sum" && op->Op()->Type() != "concat") { - AddQuantize(g, op, op_in, quantize_counter); - } + protected: + bool IsNotPermittedOpType() const override { return false; } + + // Checking whether a reorder from FP32 to BF16 + // should be added before the input to the operator + bool IsNotPermittedName(const std::string& input_name) const override { + // Only the inputs listed in \"permitted_names\" + // requires quanitization before the bfloat16 operator. + // Other inputs, such as Filter and Bias are reordered in the kernel. + const std::vector permitted_names = {"X", "Y", "Input", + "ResidualData"}; + + return std::none_of( + permitted_names.begin(), permitted_names.end(), + [&input_name](const std::string& name) { return name == input_name; }); + } + + std::string get_op_type() const override { return "quantize"; }; + std::string get_op_edge() const override { return "out"; }; + + void link_nodes(ir::Node* const physical_xput_node, ir::Node* const quant_op, + ir::Node* const quant_x_node) override { + UnlinkNodes(physical_xput_node, op); + IR_NODE_LINK_TO(physical_xput_node, quant_op); + IR_NODE_LINK_TO(quant_op, quant_x_node); + IR_NODE_LINK_TO(quant_x_node, op); + } + + void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) override { + op->Op()->SetInput(logical_xput_name, quant_xput_names); + } +}; + +class DeQuantizer final : public Quanter { + public: + DeQuantizer(Graph* const graph, ir::Node* const op) + : Quanter(*graph, op, op->Op()->Outputs()) { + auto outputs = op->outputs; + PADDLE_ENFORCE_GE( + outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal or greater than 1.", op->Name(), + outputs.size())); + + for (auto output : outputs) xputs_map[output->Name()] = output; }; - gpd(graph, handler); -} -void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { - int quantize_counter = 0; - AddReoderBeforeDuplicatedInputs(graph, quantize_counter); - AddReoderBeforeSingleInputs(graph, quantize_counter); - PrettyLogDetail("--- added %d quantize ops before bfloat16 op", - quantize_counter); -} + protected: + bool IsNotPermittedOpType() const override { + // Prior_box operator output is always FP32 so no dequantization is needed. 
+ return op->Op()->Type() == "prior_box"; + } -void AddDequantize(Graph* g, ir::Node* op, ir::Node* op_out, - int& dequantize_counter) { - if (op->Op()->Type() == "prior_box") return; - - // Find the name of the output linking op to op_out - std::vector output_names; - for (auto name : op->Op()->OutputNames()) - for (auto output_name : op->Op()->Output(name)) - if (output_name == op_out->Name() && IsPermittedOutputName(name)) - output_names.push_back(name); - - if (output_names.empty()) return; - - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - deq_desc.SetInput("Input", - std::vector({dequantize_in_node->Name()})); - deq_desc.SetOutput("Output", std::vector({op_out->Name()})); - deq_desc.SetAttr("Scale", 1.0f); - deq_desc.SetAttr("Shift", 0.0f); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. - - for (auto name = output_names.begin(); name < output_names.end(); name++) - op->Op()->SetOutput(*name, - std::vector({dequantize_in_node->Name()})); - - UnlinkNodes(op, op_out); - IR_NODE_LINK_TO(op, dequantize_in_node); - IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(dequantize_op, op_out); - - dequantize_counter++; -} + // Checking whether a reorder from BF16 to FP32 + // should be added after the output to the operator + bool IsNotPermittedName(const std::string& output_name) const override { + // XShape is output in transpose2 and reshape2 operators used to store the + // shape and lod of X. So this output do not need dequantize before. + return (output_name == "XShape"); + } + + std::string get_op_type() const override { return "dequantize"; }; + std::string get_op_edge() const override { return "in"; }; -void AddDequantizes(Graph* g, ir::Node* op, int& dequantize_counter) { - auto outputs = op->outputs; - PADDLE_ENFORCE_GE(outputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s outputs(%d) must be equal or greater than 1.", - op->Name(), outputs.size())); - PADDLE_ENFORCE_EQ(op->inputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s inputs(%d) must be equal to 1.", op->Name(), - op->inputs.size())); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - - std::vector dequantize_in_nodes(outputs.size()); - std::vector dequantize_in_node_names(outputs.size()); - - for (size_t i = 0; i < outputs.size(); i++) { - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - dequantize_in_nodes[i] = g->CreateVarNode(&dequantize_in_desc); - dequantize_in_node_names[i] = dequantize_in_nodes[i]->Name(); - - deq_desc.SetInput("Input", - std::vector({dequantize_in_node_names[i]})); - deq_desc.SetOutput("Output", - std::vector({outputs[i]->Name()})); - - deq_desc.SetAttr("Scale", 1.f); - deq_desc.SetAttr("Shift", 0.0f); - deq_desc.SetAttr("bfloat16", true); - deq_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
- - UnlinkNodes(op, outputs[i]); - IR_NODE_LINK_TO(op, dequantize_in_nodes[i]); - IR_NODE_LINK_TO(dequantize_in_nodes[i], dequantize_op); - IR_NODE_LINK_TO(dequantize_op, outputs[i]); - - dequantize_counter++; + void link_nodes(ir::Node* const physical_xput_node, ir::Node* const quant_op, + ir::Node* const quant_x_node) override { + UnlinkNodes(op, physical_xput_node); + IR_NODE_LINK_TO(quant_op, physical_xput_node); + IR_NODE_LINK_TO(quant_x_node, quant_op); + IR_NODE_LINK_TO(op, quant_x_node); } - op->Op()->SetOutput("Out", dequantize_in_node_names); -} + void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) override { + op->Op()->SetOutput(logical_xput_name, quant_xput_names); + } -// Operators like split have a single output name Out, which actually -// consists of multiple outputs. Such operators require a different way to find -// pattern and add dequantize ops. -void AddReoderAfterDuplicatedOutputs(ir::Graph* graph, - int& dequantize_counter) { - GraphPatternDetector gpd; - patterns::DuplicatedOutputs duplicated_outputs{gpd.mutable_pattern(), - "duplicated_outputs"}; - duplicated_outputs(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_outputs); - AddDequantizes(g, op, dequantize_counter); - }; - gpd(graph, handler); + ir::Node* create_quant_op(const std::string& input_name, + const std::string& output_name) const override { + return Quanter::create_quant_op(output_name, input_name); + } +}; } +using string::PrettyLogDetail; + +void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { + int quantize_counter = 0; + int dequantize_counter = 0; -// Adding dequantize ops after all operators except split, which has -// already been handled in AddReoderAfterDuplicatedOutputs -void AddReoderAfterSingleOutputs(ir::Graph* graph, int& dequantize_counter) { GraphPatternDetector gpd; - patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), - "last_bfloat16_ops"}; - bfloat16_ops(); + patterns::Bloat16Ops Bloat16Ops{gpd.mutable_pattern(), "Bloat16Ops"}; + Bloat16Ops(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); - if (op->Op()->Type() != "split") { - AddDequantize(g, op, op_out, dequantize_counter); - } + Graph* graph) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, Bloat16Ops); + + Quantizer quantizer(graph, op); + quantizer.AddQuantOps(); + quantize_counter += quantizer.get_counter(); + + DeQuantizer dequantizer(graph, op); + dequantizer.AddQuantOps(); + dequantize_counter += dequantizer.get_counter(); }; gpd(graph, handler); -} -void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { - int dequantize_counter = 0; - AddReoderAfterDuplicatedOutputs(graph, dequantize_counter); - AddReoderAfterSingleOutputs(graph, dequantize_counter); + PrettyLogDetail("--- added %d quantize ops before bfloat16 op", + quantize_counter); PrettyLogDetail("--- added %d dequantize ops after bfloat16 op", dequantize_counter); } -void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { - SetInputDataType(graph); - SetOutputDataType(graph); -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h index 3a7271f7ddc59..69c7ce35162ff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h +++ 
b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h @@ -24,8 +24,6 @@ namespace ir { class CPUBFloat16Pass : public Pass { protected: - void SetInputDataType(ir::Graph* graph) const; - void SetOutputDataType(ir::Graph* graph) const; void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index d89891ec3c857..fc7a53c4e7923 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -27,8 +27,16 @@ namespace ir { using string::PrettyLogDetail; -void CPUBfloat16PlacementPass::SetMkldnnDataType( - ir::Graph* graph, int* bfloat16_operators) const { +void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { + int bfloat16_operators = 0; + bfloat16_operators += SetMkldnnDataType(graph); + bfloat16_operators -= RemoveOrphanedOperators(graph); + bfloat16_operators -= RemoveUnsupportedOperators(graph); + PrettyLogDetail("--- marked %d operators to bfloat16 ", + bfloat16_operators); +} + +int CPUBfloat16PlacementPass::SetMkldnnDataType(ir::Graph* graph) const { const auto& op_types_list = Get>("bfloat16_enabled_op_types"); // set mkldnn_data_type to bfloat16 to all operators that are in @@ -39,6 +47,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( "bfloat16_placement"}; bfloat16_placement_pattern(op_types_list); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern); @@ -50,58 +59,58 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( if ((op->Op()->HasAttr("mkldnn_data_type") || op->Op()->HasProtoAttr("mkldnn_data_type")) && !platform::HasOpINT8DataType(op->Op())) { + VLOG(4) << "--- marked " << op->Op()->Type() + << " operator to bfloat16 "; op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16")); - (*bfloat16_operators)++; + detected_operators++; } }; gpd(graph, handler); + return detected_operators; } -void CPUBfloat16PlacementPass::RemoveOrphanedOperators( - ir::Graph* graph, int* bfloat16_operators) const { +int CPUBfloat16PlacementPass::RemoveOrphanedOperators(ir::Graph* graph) const { // find orphaned bfloat16 operator that is between two float32 operators // revert mkldnn_data_type attr to float32 GraphPatternDetector gpd; patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(), "orphaned_bfloat16"}; orphaned_bfloat16_pattern(); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern); op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); - bfloat16_operators--; + VLOG(4) << "--- demarked " << op->Op()->Type() << " operator to bfloat16 "; + detected_operators++; }; gpd(graph, handler); + return detected_operators; } -void CPUBfloat16PlacementPass::RemoveUnsupportedOperators( - ir::Graph* graph, int* bfloat16_operators) const { +int CPUBfloat16PlacementPass::RemoveUnsupportedOperators( + ir::Graph* graph) const { // now quantize is supported FP32 only, so try to find // bfloat16 operator that input type is not FP32 GraphPatternDetector gpd; patterns::UnsupportedBfloat16 unsupported_bfloat16_pattern{ gpd.mutable_pattern(), "unsupported_bfloat16"}; unsupported_bfloat16_pattern(); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, 
Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, unsupported_bfloat16_pattern); GET_IR_NODE_FROM_SUBGRAPH(op, op, unsupported_bfloat16_pattern); if ((prev_out->Var()->GetDataType() != proto::VarType::FP32)) { op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); - bfloat16_operators--; + VLOG(4) << "--- demarked " << op->Op()->Type() + << " operator to bfloat16 "; + detected_operators++; } }; gpd(graph, handler); -} - -void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { - int bfloat16_operators = 0; - SetMkldnnDataType(graph, &bfloat16_operators); - RemoveOrphanedOperators(graph, &bfloat16_operators); - RemoveUnsupportedOperators(graph, &bfloat16_operators); - PrettyLogDetail("--- marked %d operators to bfloat16 ", - bfloat16_operators); + return detected_operators; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h index facc4c4c55221..63848298a879a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -26,14 +26,11 @@ namespace ir { */ class CPUBfloat16PlacementPass : public Pass { protected: - void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; - - void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const; - - void RemoveUnsupportedOperators(ir::Graph* graph, - int* bfloat16_operators) const; - void ApplyImpl(ir::Graph* graph) const override; + + int SetMkldnnDataType(ir::Graph* graph) const; + int RemoveOrphanedOperators(ir::Graph* graph) const; + int RemoveUnsupportedOperators(ir::Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc new file mode 100644 index 0000000000000..fe42e8f96f851 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "prog_x", {1, 128, 52, 52}); + return param_scope; +} + +void MainTest() { + Layers layers; + auto prog_x = layers.data("prog_x", {1, 128, 52, 52}); + auto first_reshape2 = layers.reshape2(prog_x, {-1, 2, 64, 52, 52}, true); + first_reshape2->SetShape({-1, 2, 64, 52, 52}); + auto transpose2 = layers.transpose2(first_reshape2, {0, 2, 1, 3, 4}, true); + transpose2->SetShape({-1, 64, 2, 52, 52}); + auto second_reshape2 = layers.reshape2(transpose2, {-1, 128, 52, 52}, true); + second_reshape2->SetShape({-1, 128, 52, 52}); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + int added_nodes = 1; // shuffle_channel + int removed_nodes = 5; // 2 * reshape, reshape_out, transpose, transpose_out + + int original_nodes_num = graph->Nodes().size(); + auto pass = + PassRegistry::Instance().Get("shuffle_channel_mkldnn_detect_pass"); + graph.reset(pass->Apply(graph.release())); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(current_nodes_num, + original_nodes_num + added_nodes - removed_nodes); + EXPECT_EQ(GetNumOpNodes(graph, "reshape2"), 0); + EXPECT_EQ(GetNumOpNodes(graph, "transpose2"), 0); + EXPECT_EQ(GetNumOpNodes(graph, "shuffle_channel"), 1); + + for (const auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "shuffle_channel") { + const auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn"))); + } + } +} + +TEST(ShuffleChannelOneDNNDetectPass, ShuffleChannelOneDNNDetectPassTest) { + MainTest(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(shuffle_channel_mkldnn_detect_pass); diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 61cd7ad01696e..7a83fdccc218c 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -34,7 +34,6 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, mpi_rank_ = trainer_desc.mpi_rank(); mpi_size_ = trainer_desc.mpi_size(); dump_file_num_ = trainer_desc.dump_file_num(); - for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); i++) { need_merge_var_names_.push_back( diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 0164c45307649..c75a7871d63e9 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -328,21 +328,21 @@ bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const { } // TODO(paddle-dev): Can this be template? 
-paddle::SmallVector +paddle::small_vector InterpretercoreInferShapeContext::GetInputVarPtrs( const std::string& name) const { const std::vector& vars = InputVars(name); - paddle::SmallVector res; + paddle::small_vector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; } -paddle::SmallVector +paddle::small_vector InterpretercoreInferShapeContext::GetOutputVarPtrs( const std::string& name) const { const std::vector& vars = OutputVars(name); - paddle::SmallVector res; + paddle::small_vector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; @@ -365,6 +365,11 @@ std::vector InterpretercoreInferShapeContext::GetInputsDim( return GetDims(vars); } +proto::VarType::Type InterpretercoreInferShapeContext::GetInputVarType( + const std::string& name) const { + return GetVarType(InputVars(name).at(0)); +} + std::vector InterpretercoreInferShapeContext::GetInputsVarType( const std::string& name) const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 83eaf9514a136..20e51145a51b2 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -90,16 +90,18 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool IsRunMKLDNNKernel() const override; // TODO(paddle-dev): Can this be template? - paddle::SmallVector + paddle::small_vector GetInputVarPtrs(const std::string& name) const override; - paddle::SmallVector + paddle::small_vector GetOutputVarPtrs(const std::string& name) const override; DDim GetInputDim(const std::string& name) const override; std::vector GetInputsDim(const std::string& name) const override; + proto::VarType::Type GetInputVarType(const std::string& name) const override; + std::vector GetInputsVarType( const std::string& name) const override; diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index e03277fb31799..23bd777fae1d5 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -74,11 +74,12 @@ PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #ifdef PADDLE_WITH_XPU_KP PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #else PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT); #endif PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 4ef1d3a83a267..87d3a048d0be0 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -202,10 +202,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { } } - paddle::SmallVector + paddle::small_vector GetInputVarPtrs(const std::string &name) const override { const std::vector arg_names = Inputs(name); - paddle::SmallVector res; + paddle::small_vector res; res.reserve(arg_names.size()); std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), [this](const std::string &name) { @@ -214,10 +214,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { return 
res; } - paddle::SmallVector + paddle::small_vector GetOutputVarPtrs(const std::string &name) const override { const std::vector arg_names = Outputs(name); - paddle::SmallVector res; + paddle::small_vector res; res.reserve(arg_names.size()); std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), [this](const std::string &name) { @@ -245,6 +245,10 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool IsRunMKLDNNKernel() const override; + proto::VarType::Type GetInputVarType(const std::string &name) const override { + return GetVarType(Inputs(name).at(0)); + } + std::vector GetInputsVarType( const std::string &name) const override { return GetVarTypes(Inputs(name)); diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index d69edef7840f5..d14254b7355c9 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -21,13 +21,17 @@ namespace framework { std::unique_ptr OpRegistry::CreateOp( const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, AttributeMap attrs, bool attr_check) { + const VariableNameMap& outputs, const AttributeMap& attrs, + bool attr_check) { auto& info = OpInfoMap::Instance().Get(type); if (attr_check && info.Checker() != nullptr) { - info.Checker()->Check(&attrs); + auto tmp_attrs = attrs; + info.Checker()->Check(&tmp_attrs); + return std::unique_ptr( + info.Creator()(type, inputs, outputs, tmp_attrs)); } - auto op = info.Creator()(type, inputs, outputs, attrs); - return std::unique_ptr(op); + return std::unique_ptr( + info.Creator()(type, inputs, outputs, attrs)); } static VariableNameMap ConvertOpDescVarsToVarNameMap( diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index eb40a49b4066a..a1f07f9f2520e 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -129,7 +129,7 @@ class OpRegistry { static std::unique_ptr CreateOp(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, - AttributeMap attrs, + const AttributeMap& attrs, bool attr_check = true); static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 140103b10592f..18287f0c7a4ee 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -946,19 +943,19 @@ class RuntimeInferShapeContext : public InferShapeContext { } // TODO(paddle-dev): Can this be template? 
- paddle::SmallVector + paddle::small_vector GetInputVarPtrs(const std::string& name) const override { const std::vector& vars = InputVars(name); - paddle::SmallVector res; + paddle::small_vector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; } - paddle::SmallVector + paddle::small_vector GetOutputVarPtrs(const std::string& name) const override { const std::vector& vars = OutputVars(name); - paddle::SmallVector res; + paddle::small_vector res; res.reserve(vars.size()); res.insert(res.begin(), vars.begin(), vars.end()); return res; @@ -979,6 +976,10 @@ class RuntimeInferShapeContext : public InferShapeContext { return GetDims(vars); } + proto::VarType::Type GetInputVarType(const std::string& name) const override { + return GetVarType(InputVars(name).at(0)); + } + std::vector GetInputsVarType( const std::string& name) const override { return GetVarTypes(InputVars(name)); @@ -1277,6 +1278,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); } +// TODO(Liu-xiandong): Now we are using too much if-else and hard code in XPU +// device, it's ugly, and we will refactor in the future. +#if defined(PADDLE_WITH_XPU_KP) + bool use_phi_xpu_kp = false; +#endif + // TODO(chenweihang): Now we are still reusing a lot of the original fluid // implementation, this is a gradual replacement process // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA @@ -1295,6 +1302,45 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); pt_kernel_name = kernel_signature_->name; +// NOTE(Liu-xiandong): The register kernel used KP have library_type[KP], +// But the default library_type is Plain, so we need to modify the +// library_type here, otherwise it can't work. 
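// A minimal sketch of the retargeting step described in the NOTE above,
// assuming the surrounding members of OperatorWithKernel::RunImpl; the local
// names original_library_type and kp_kernel_key are illustrative only and are
// not part of this patch. The kernel key is switched to the KP library, kept
// only if the phi kernel factory actually has a kernel registered under the
// resulting key, and reverted otherwise:
//
//   auto original_library_type = kernel_type_->library_type_;
//   kernel_type_->library_type_ = LibraryType::kKP;
//   auto kp_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get());
//   if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name,
//                                                 kp_kernel_key)) {
//     kernel_type_->library_type_ = original_library_type;  // no KP kernel
//   }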
+#ifdef PADDLE_WITH_XPU_KP + if (paddle::platform::is_xpu_place(kernel_type_->place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "phi xpu_kp using rt mode in static graph"; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "phi xpu_kp using debug mode in static graph"; + } + bool is_xpu_kp_support = + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + auto expected_kernel_key_library_type = kernel_type_->library_type_; + kernel_type_->library_type_ = LibraryType::kKP; + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name + << ", using_kernel_key:" << *kernel_type_.get(); + auto try_pt_kernel_key = + TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); + if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, + try_pt_kernel_key)) { + kernel_type_->library_type_ = expected_kernel_key_library_type; + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name << " failed " << *kernel_type_.get(); + } else { + use_phi_xpu_kp = true; + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name << " succeeded " << *kernel_type_.get(); + } + } + } +#endif pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( @@ -1310,9 +1356,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } else { pt_kernel_name = kernel_signature_->name; -// NOTE(Liu-xiandong): The register kernel used KP have library_type[KP], -// But the default library_type is Plain, so we need to modify the -// library_type here, otherwise it can't work. +// NOTE(Liu-xiandong): In my ctest runs this branch is not executed; +// I can't explain why, and it's really confusing, +// but we still need to keep it to avoid errors.
#ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(kernel_type_->place_)) { bool use_xpu_kp_kernel_rt = @@ -1331,15 +1377,20 @@ if (is_xpu_kp_support) { auto expected_kernel_key_library_type = kernel_type_->library_type_; kernel_type_->library_type_ = LibraryType::kKP; - VLOG(3) << "modifing XPU KP kernel in static graph: " << type_ + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name + << ", using_kernel_key:" << *kernel_type_.get(); auto try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, try_pt_kernel_key)) { kernel_type_->library_type_ = expected_kernel_key_library_type; - VLOG(3) << "modify XPU KP kernel in static graph: " << type_ - << " is failed " << *kernel_type_.get(); + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name << " failed " << *kernel_type_.get(); + } else { + use_phi_xpu_kp = true; + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name << " succeeded " << *kernel_type_.get(); } } } @@ -1356,11 +1407,25 @@ void OperatorWithKernel::RunImpl(const Scope& scope, !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || paddle::platform::is_in_xpu_black_list(type_); #endif +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + paddle::platform::is_xpu_place(kernel_type_->place_) && + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_xpu_place(kernel_type_->place_) && + paddle::platform::is_in_xpu_kpwhite_list(type_); + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); +#endif + + if (pt_kernel_->IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + && (!is_xpu_unsupport || use_phi_xpu_kp) +#endif + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1370,15 +1435,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // we need to select the heterogeneous kernel in fluid, but the kernel // registered in KP use library_type[KP], we need to modify it.
#ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - paddle::platform::is_xpu_place(kernel_type_->place_) && - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_xpu_place(kernel_type_->place_) && - paddle::platform::is_in_xpu_kpwhite_list(type_); - bool is_xpu_kp_support = - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { kernel_type_->library_type_ = LibraryType::kKP; } @@ -1605,7 +1661,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(type_))) { - VLOG(3) << "missing XPU kernel: " << type_ + VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -1621,10 +1677,10 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(type_); if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; + VLOG(3) << "fluid xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; + VLOG(3) << "fluid xpu_kp using debug mode "; } bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { @@ -1641,7 +1697,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } else { - VLOG(3) << "using XPU KP kernel: " << type_ + VLOG(3) << "fluid using XPU KP kernel: " << type_ << ", using_kernel_key:" << expected_kernel_key; } } @@ -1650,7 +1706,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { paddle::platform::is_in_xpu_black_list(type_)); if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << type_ + VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -2340,7 +2396,7 @@ void OperatorWithKernel::BuildPhiKernelContext( tensor_in = &(var->Get()); pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { - paddle::SmallVector tensor_vector; + paddle::small_vector tensor_vector; auto& tensor_array = var->Get(); for (auto& t : tensor_array) { tensor_vector.emplace_back(&t); @@ -2389,7 +2445,7 @@ void OperatorWithKernel::BuildPhiKernelContext( tensor_out = var->template GetMutable(); pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { - paddle::SmallVector tensor_vector; + paddle::small_vector tensor_vector; auto* tensor_array = var->template GetMutable(); // Note: If the input LoDTensorArray size is 0, the output @@ -2413,185 +2469,210 @@ void OperatorWithKernel::BuildPhiKernelContext( VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { - auto attr_iter = Attrs().find(attr_names[i]); - if (attr_iter != Attrs().end()) { // shape is in the attribute - if (std::type_index(attr_iter->second.type()) == - std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( - 
BOOST_GET_CONST(std::vector, attr_iter->second)))); - } else if (std::type_index(attr_iter->second.type()) == - std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( - BOOST_GET_CONST(std::vector, attr_iter->second)))); - } else if (std::type_index(attr_iter->second.type()) == - std::type_index(typeid(int32_t))) { - pt_kernel_context->EmplaceBackAttr(std::move( - phi::IntArray(&BOOST_GET_CONST(int32_t, attr_iter->second), 1))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to IntArray when " - "construct KernelContext.", - attr_names[i])); - } - } else { // shape is in the input - auto& ins_vector = ctx.inputs.at(attr_names[i]); - if (ins_vector.size() == 1) { // ShapeTensor + VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": " + << attr_defs[i].type_index; + auto attr_iter = Attrs().find(attr_names[i]); + switch (attr_defs[i].type_index) { + case phi::AttributeType::SCALAR: + if (attr_iter != Attrs().end()) { + // scalar is in the attribute + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::FLOAT: + pt_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(BOOST_GET_CONST(float, attr_iter->second)))); + break; + case proto::AttrType::INT: + pt_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(BOOST_GET_CONST(int, attr_iter->second)))); + break; + case proto::AttrType::STRING: + pt_kernel_context->EmplaceBackAttr(std::move(phi::Scalar( + BOOST_GET_CONST(std::string, attr_iter->second)))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "KernelContext in dygraph.", + attr_names[i])); + } + } else { // scalar is in the input + auto& ins_vector = ctx.inputs.at(attr_names[i]); pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePhiIntArrayFromVar(*ins_vector.front()))); - } else { // ShapeTensorList - pt_kernel_context->EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVarList(ins_vector))); + experimental::MakePhiScalarFromVar(*ins_vector.front()))); } - } - } else if (attr_defs[i].type_index == - std::type_index(typeid(phi::Scalar))) { - // TODO(chenweihang): support other attrs later - // TODO(zhangyunfei): Scalar should hold scaler type, and we should check - // attribtue type by attr_defs - auto attr_iter = Attrs().find(attr_names[i]); - if (attr_iter != Attrs().end()) { // scalar is in the attribute - auto& attr = Attrs().at(attr_names[i]); - if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(float, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(std::string, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(int, attr)))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to Scalar when construct " - "KernelContext.", - attr_names[i])); - } - } else { - auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context->EmplaceBackAttr( - std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); - } - - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - auto& attr = Attrs().at(attr_names[i]); - if 
(std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); + break; + case phi::AttributeType::INT_ARRAY: + if (attr_iter != Attrs().end()) { + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::INTS: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + BOOST_GET_CONST(std::vector, attr_iter->second)))); + break; + case proto::AttrType::LONGS: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + BOOST_GET_CONST(std::vector, attr_iter->second)))); + break; + case proto::AttrType::INT: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + &BOOST_GET_CONST(int32_t, attr_iter->second), 1))); + break; + case proto::AttrType::LONG: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + &BOOST_GET_CONST(int64_t, attr_iter->second), 1))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "construct KernelContext.", + attr_names[i])); + } + } else { // shape is in the input + auto& ins_vector = ctx.inputs.at(attr_names[i]); + if (ins_vector.size() == 1) { // ShapeTensor + pt_kernel_context->EmplaceBackAttr(std::move( + experimental::MakePhiIntArrayFromVar(*ins_vector.front()))); + } else { // ShapeTensorList + pt_kernel_context->EmplaceBackAttr(std::move( + experimental::MakePhiIntArrayFromVarList(ins_vector))); + } } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to vector when " - "construct KernelContext.", - attr_names[i])); - } - } else { - // TODO(chenweihang): support other attrs later - auto attr_it = attrs_.find(attr_names[i]); - if (attr_defs[i].type_index == std::type_index(typeid(int))) { - if (attr_it == attrs_.end()) { - auto in_it = ctx.inputs.find(attr_names[i]); - if (in_it != ctx.inputs.end()) { - // get data from input - auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); - int32_t val_int = val.template to(); - pt_kernel_context->EmplaceBackAttr(val_int); - } else { - PADDLE_THROW(platform::errors::NotFound( - "can not find attribute `%s` both in attribute and input ", + break; + case phi::AttributeType::SCALARS: { + PADDLE_ENFORCE_NE( + attr_iter, Attrs().end(), + platform::errors::NotFound("(%s) is not found in AttributeMap when " + 
"buildind static KernelContext.", + attr_names[i])); + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::INTS: { + const auto& vec = + BOOST_GET_CONST(std::vector, attr_iter->second); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::LONGS: { + const auto& vec = + BOOST_GET_CONST(std::vector, attr_iter->second); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::FLOATS: { + const auto& vec = + BOOST_GET_CONST(std::vector, attr_iter->second); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::FLOAT64S: { + const auto& vec = + BOOST_GET_CONST(std::vector, attr_iter->second); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::BOOLEANS: { + const auto& vec = + BOOST_GET_CONST(std::vector, attr_iter->second); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", attr_names[i])); - } - } else { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(int, attr_it->second)); } - } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(float, attr_it->second)); - } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(bool, attr_it->second)); - } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(int64_t, attr_it->second)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::string, attr_it->second)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPhiDataType( - static_cast( - BOOST_GET_CONST(int, attr_it->second))); - pt_kernel_context->EmplaceBackAttr(data_type); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - if (std::type_index(attr_it->second.type()) == - std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr_it->second)); - } else if (std::type_index(attr_it->second.type()) == - std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Phi_Kernel args. 
- const auto& vector_int_attr = - BOOST_GET_CONST(std::vector, attr_it->second); - const std::vector vector_int64_attr(vector_int_attr.begin(), - vector_int_attr.end()); - pt_kernel_context->EmplaceBackAttr(vector_int64_attr); + } break; + default: { + PADDLE_ENFORCE_NE( + attr_iter, Attrs().end(), + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "buildind static KernelContext.", + attr_names[i])); + switch (attr_defs[i].type_index) { + case phi::AttributeType::FLOAT32: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_iter->second)); + break; + case phi::AttributeType::INT32: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_iter->second)); + break; + case phi::AttributeType::BOOL: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_iter->second)); + break; + case phi::AttributeType::INT64: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_iter->second)); + break; + case phi::AttributeType::INT32S: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_iter->second)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = framework::TransToPhiDataType( + static_cast( + BOOST_GET_CONST(int, attr_iter->second))); + pt_kernel_context->EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + pt_kernel_context->EmplaceBackAttr( + std::move(BOOST_GET_CONST(std::string, attr_iter->second))); + break; + case phi::AttributeType::INT64S: + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::LONGS: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_iter->second)); + break; + case proto::AttrType::INTS: { + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_iter->second); + const std::vector vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + pt_kernel_context->EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector " + "when " + "construct KernelContext.", + attr_names[i])); + } + break; + case phi::AttributeType::FLOAT32S: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_iter->second)); + break; + case phi::AttributeType::STRINGS: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_iter->second)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when construct " + "KernelContext in dygraph.", + attr_names[i])); } - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = - BOOST_GET_CONST(std::vector, attr_it->second); - pt_kernel_context->EmplaceBackAttr(vector_int_attr); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr_it->second)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr_it->second)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` when construct " - "KernelContext.", - attr_names[i])); } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 70e9f5c1b1457..2e00e07535b1d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -333,8 +333,8 @@ class ExecutionContext { return it->second; } - 
virtual paddle::SmallVector InNameList() const { - paddle::SmallVector vec_temp; + virtual paddle::small_vector InNameList() const { + paddle::small_vector vec_temp; vec_temp.reserve(ctx_.inputs.size()); for (auto& input : ctx_.inputs) { @@ -479,6 +479,11 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { } bool IsDenseTensorInput(const std::string& name) const override { + const auto* var = ctx_.InputVar(name); + return var->IsType(); + } + + bool IsDenseTensorInputs(const std::string& name) const override { auto vars = ctx_.MultiInputVar(name); return std::all_of(vars.begin(), vars.end(), [](const Variable* var) { return var->IsType(); @@ -486,10 +491,8 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { } bool IsSelectedRowsInput(const std::string& name) const override { - auto vars = ctx_.MultiInputVar(name); - return std::all_of(vars.begin(), vars.end(), [](const Variable* var) { - return var->IsType(); - }); + const auto* var = ctx_.InputVar(name); + return var->IsType(); } bool IsDenseTensorVectorInput(const std::string& name) const override { diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index fe7c56827612c..3eda00006f959 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -41,9 +41,9 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { ~KernelArgsNameMakerByOpProto() {} - const paddle::SmallVector& GetInputArgsNames() override; - const paddle::SmallVector& GetOutputArgsNames() override; - const paddle::SmallVector& GetAttrsArgsNames() override; + const paddle::small_vector& GetInputArgsNames() override; + const paddle::small_vector& GetOutputArgsNames() override; + const paddle::small_vector& GetAttrsArgsNames() override; phi::KernelSignature GetKernelSignature(); @@ -53,9 +53,9 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { private: const framework::proto::OpProto* op_proto_; - paddle::SmallVector input_names_; - paddle::SmallVector output_names_; - paddle::SmallVector attr_names_; + paddle::small_vector input_names_; + paddle::small_vector output_names_; + paddle::small_vector attr_names_; }; OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { @@ -81,19 +81,21 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { phi::KernelKey TransOpKernelTypeToPhiKernelKey( const OpKernelType& kernel_type) { phi::Backend backend = phi::TransToPhiBackend(kernel_type.place_); - if (kernel_type.library_type_ == LibraryType::kMKLDNN) { - backend = phi::Backend::MKLDNN; - } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { - backend = phi::Backend::GPUDNN; - } else if (kernel_type.library_type_ == LibraryType::kKP) { - backend = phi::Backend::KPS; - } else { - // do nothing + switch (kernel_type.library_type_) { + case LibraryType::kCUDNN: + backend = phi::Backend::GPUDNN; + break; + case LibraryType::kMKLDNN: + backend = phi::Backend::MKLDNN; + break; + case LibraryType::kKP: + backend = phi::Backend::KPS; + break; + default: + break; } - paddle::experimental::DataLayout layout = kernel_type.data_layout_; - paddle::experimental::DataType dtype = - paddle::framework::TransToPhiDataType(kernel_type.data_type_); - return phi::KernelKey(backend, layout, dtype); + return phi::KernelKey(backend, kernel_type.data_layout_, + framework::TransToPhiDataType(kernel_type.data_type_)); } phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, @@ -149,7 
+151,7 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, return phi::KernelKey(); } -const paddle::SmallVector& +const paddle::small_vector& KernelArgsNameMakerByOpProto::GetInputArgsNames() { for (int i = 0; i < op_proto_->inputs_size(); ++i) { auto& in = op_proto_->inputs()[i]; @@ -174,7 +176,7 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { return input_names_; } -const paddle::SmallVector& +const paddle::small_vector& KernelArgsNameMakerByOpProto::GetOutputArgsNames() { for (int i = 0; i < op_proto_->outputs_size(); ++i) { auto& out = op_proto_->outputs()[i]; @@ -194,7 +196,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { return output_names_; } -const paddle::SmallVector& +const paddle::small_vector& KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { for (int i = 0; i < op_proto_->attrs_size(); ++i) { auto& attr = op_proto_->attrs()[i]; diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index a99abbf0cebbb..785ede5c60175 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -53,9 +53,9 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, class KernelArgsNameMaker { public: virtual ~KernelArgsNameMaker() {} - virtual const paddle::SmallVector& GetInputArgsNames() = 0; - virtual const paddle::SmallVector& GetOutputArgsNames() = 0; - virtual const paddle::SmallVector& GetAttrsArgsNames() = 0; + virtual const paddle::small_vector& GetInputArgsNames() = 0; + virtual const paddle::small_vector& GetOutputArgsNames() = 0; + virtual const paddle::small_vector& GetAttrsArgsNames() = 0; }; void InitDefaultKernelSignatureMap(); diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e4004c2fbf3b5..9b12870a2bb9b 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -23,7 +23,8 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ + defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 452c960166cb2..ad1ddbfabd091 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -18,7 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/lodtensor_printer.h" #include "paddle/fluid/string/string_helper.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ + defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" @@ -132,8 +133,11 @@ void PSGPUWorker::TrainFiles() { device_reader_->Start(); int cur_batch; int batch_cnt = 0; - +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::SetXPUDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { @@ -230,7 +234,11 @@ void PSGPUWorker::TrainFilesWithProfiler() { int total_ins_num = 0; int cur_batch; timeline.Start(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::SetXPUDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; timeline.Pause(); diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 4600213596e62..44f0ce0165c5b 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -65,6 +65,8 @@ class InferShapeContext { virtual bool HasOutput(const std::string &name) const = 0; virtual bool HasAttr(const std::string &name) const = 0; + virtual proto::VarType::Type GetInputVarType( + const std::string &name) const = 0; virtual std::vector GetInputsVarType( const std::string &name) const = 0; virtual std::vector GetOutputsVarType( @@ -108,9 +110,9 @@ class InferShapeContext { virtual bool IsRunMKLDNNKernel() const = 0; - virtual paddle::SmallVector + virtual paddle::small_vector GetInputVarPtrs(const std::string &name) const = 0; - virtual paddle::SmallVector + virtual paddle::small_vector GetOutputVarPtrs(const std::string &name) const = 0; virtual const phi::ArgumentMappingFn *GetPhiArgumentMappingFn() const = 0; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 8a11775702e57..2496d4d040e2e 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -248,7 +248,8 @@ class HeterXpuTrainer : public TrainerBase { #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ + defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUTrainer : public TrainerBase { public: diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index f189d0213da88..1f1122d32f5c3 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -76,7 +76,8 @@ REGISTER_TRAINER_CLASS(HeterPipelineTrainer); (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ + defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 7d60b7d26f3fb..3f6863d642cc8 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ 
b/paddle/fluid/imperative/amp_auto_cast.cc @@ -220,6 +220,7 @@ inline bool NeedCast(const std::shared_ptr& var) { paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || paddle::platform::is_mlu_place(place) || + paddle::platform::is_custom_place(place) || paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 330a5a0cfa90e..124c31df73349 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -117,8 +117,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { return it->second; } - paddle::SmallVector InNameList() const override { - paddle::SmallVector vec_temp; + paddle::small_vector InNameList() const override { + paddle::small_vector vec_temp; vec_temp.reserve(var_map_in_.size()); for (auto& v : var_map_in_) { diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 8a5d942e059c0..b5df973869a9f 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -239,9 +239,10 @@ class DygraphInferShapeContext : public framework::InferShapeContext { (op_kernel_type_->data_layout_ == framework::DataLayout::kMKLDNN)); } - paddle::SmallVector + paddle::small_vector GetInputVarPtrs(const std::string& name) const override { - paddle::SmallVector + paddle::small_vector res; auto it = var_map_in_->find(name); PADDLE_ENFORCE_NE( @@ -253,10 +254,10 @@ class DygraphInferShapeContext : public framework::InferShapeContext { return res; } - paddle::SmallVector + paddle::small_vector GetOutputVarPtrs(const std::string& name) const override { - paddle::SmallVector + paddle::small_vector res; auto it = var_map_out_->find(name); PADDLE_ENFORCE_NE( @@ -300,6 +301,15 @@ class DygraphInferShapeContext : public framework::InferShapeContext { return vec_res; } + framework::proto::VarType::Type GetInputVarType( + const std::string& name) const override { + auto it = var_map_in_->find(name); + PADDLE_ENFORCE_NE( + it, var_map_in_->end(), + platform::errors::NotFound("can not find [%s] in input", name)); + return framework::ToVarType(it->second[0]->Var().Type()); + } + std::vector GetInputsVarType( const std::string& name) const override { std::vector vec_res; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 038ea575247d5..e928cbb654839 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -459,7 +459,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - auto* op_kernel = dynamic_cast(&op); + auto* op_kernel = static_cast(&op); PADDLE_ENFORCE_NOT_NULL( op_kernel, platform::errors::PermissionDenied( "Only support operator with kernel in Dygraph mode.")); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 6c056605faa48..38180ba963c38 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -40,6 +40,13 @@ static const phi::Kernel empty_kernel; static const framework::RuntimeContext empty_ctx({}, {}); static const framework::Scope empty_scope; +const phi::KernelFactory& 
PreparedOp::phi_kernel_factory = + phi::KernelFactory::Instance(); +const phi::OpUtilsMap& PreparedOp::phi_op_utils_map = + phi::OpUtilsMap::Instance(); +const phi::DefaultKernelSignatureMap& PreparedOp::default_phi_kernel_sig_map = + phi::DefaultKernelSignatureMap::Instance(); + const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var) { return var->SharedVar(); @@ -139,12 +146,14 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, phi_kernel_(phi_kernel) {} template -PreparedOp PrepareImpl(const NameVarMap& ins, - const NameVarMap& outs, - const framework::OperatorWithKernel& op, - const platform::Place& place, - const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs) { +PreparedOp PrepareImpl( + const NameVarMap& ins, const NameVarMap& outs, + const framework::OperatorWithKernel& op, const platform::Place& place, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const phi::KernelFactory& phi_kernel_factory, + const phi::OpUtilsMap& phi_op_utils_map, + const phi::DefaultKernelSignatureMap& default_phi_kernel_sig_map) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -184,15 +193,15 @@ PreparedOp PrepareImpl(const NameVarMap& ins, bool has_phi_kernel = false; - const auto* arg_map_fn = - phi::OpUtilsMap::Instance().GetArgumentMappingFn(op.Type()); + const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type()); + if (arg_map_fn) { has_phi_kernel = true; kernel_signature = (*arg_map_fn)( framework::ExecutionArgumentMappingContext(dygraph_exe_ctx)); } else { default_kernel_signature = - phi::DefaultKernelSignatureMap::Instance().GetNullable(op.Type()); + default_phi_kernel_sig_map.GetNullable(op.Type()); if (default_kernel_signature) { has_phi_kernel = true; kernel_signature = *default_kernel_signature; @@ -224,23 +233,26 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto expected_kernel_key_library_type = expected_kernel_key.library_type_; expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - VLOG(3) << "modifing XPU KP kernel: " << op.Type() + VLOG(3) << "modifing XPU KP kernel: " << pt_kernel_name << ", using_kernel_key:" << expected_kernel_key; + phi::KernelKey try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); - if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, - try_pt_kernel_key)) { + if (!phi_kernel_factory.HasKernel(pt_kernel_name, try_pt_kernel_key)) { expected_kernel_key.library_type_ = expected_kernel_key_library_type; - VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed " - << expected_kernel_key; + VLOG(3) << "modify XPU KP kernel: " << pt_kernel_name + << " in dynamic graph is failed " << expected_kernel_key; + } else { + VLOG(3) << "modify XPU KP kernel: " << pt_kernel_name + << " in dynamic graph is succeed " << expected_kernel_key; } } } #endif pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); - auto& phi_kernel = phi::KernelFactory::Instance().SelectKernel( - pt_kernel_name, pt_kernel_key); + auto& phi_kernel = + phi_kernel_factory.SelectKernel(pt_kernel_name, pt_kernel_key); if (phi_kernel.IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) @@ -295,11 +307,11 @@ PreparedOp PrepareImpl(const NameVarMap& ins, || (is_xpu_unsupport && !is_xpu_kp_support) #endif ) { - if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { + if (has_phi_kernel) { auto pt_cpu_kernel_key = 
FallBackToCpu(expected_kernel_key, pt_kernel_key, op); - auto& pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( - pt_kernel_name, pt_cpu_kernel_key); + auto& pt_cpu_kernel = + phi_kernel_factory.SelectKernel(pt_kernel_name, pt_cpu_kernel_key); if (pt_cpu_kernel.IsValid()) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key @@ -324,7 +336,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << op.Type() + VLOG(3) << "fluid missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -335,20 +347,20 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; + VLOG(3) << "fluid xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; + VLOG(3) << "fluid xpu_kp using debug mode "; } if (is_xpu_kp_support) { expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() + VLOG(3) << "using fluid XPU KP kernel: " << op.Type() << ", using_kernel_key:" << expected_kernel_key; } if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << op.Type() + VLOG(3) << "fluid missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -408,7 +420,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs); + return PrepareImpl(ins, outs, op, place, attrs, default_attrs, + phi_kernel_factory, phi_op_utils_map, + default_phi_kernel_sig_map); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -417,8 +431,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, - default_attrs); + return PrepareImpl( + ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, + phi_op_utils_map, default_phi_kernel_sig_map); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -427,8 +442,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, - default_attrs); + return PrepareImpl( + ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, + phi_op_utils_map, default_phi_kernel_sig_map); } template static void PreparedOpRunImpl( @@ -441,10 +457,9 @@ static void PreparedOpRunImpl( const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph - framework::Scope scope; { - platform::RecordEvent record_event(op.Type() + "::infer_shape", + platform::RecordEvent 
record_event("infer_shape", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); DygraphInferShapeContext infer_shape_ctx( @@ -454,12 +469,12 @@ static void PreparedOpRunImpl( } { - platform::RecordEvent record_event(op.Type() + "::compute", + platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, - attrs, default_attrs)); + func(DygraphExecutionContext(op, empty_scope, *dev_ctx, ctx, ins, + outs, attrs, default_attrs)); } if (FLAGS_check_nan_inf) { @@ -503,7 +518,7 @@ static void PreparedOpRunPtImpl( const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { { - platform::RecordEvent record_event(op.Type() + "::infer_shape", + platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); DygraphInferShapeContext infer_shape_ctx( @@ -513,7 +528,7 @@ static void PreparedOpRunPtImpl( } { - platform::RecordEvent record_event(op.Type() + "::compute", + platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index dedb6a382efa6..129f75e75de1e 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -214,9 +214,13 @@ class PreparedOp { const phi::KernelSignature* default_kernel_signature_; phi::KernelSignature kernel_signature_; const phi::Kernel& phi_kernel_; + + static const phi::KernelFactory& phi_kernel_factory; + static const phi::OpUtilsMap& phi_op_utils_map; + static const phi::DefaultKernelSignatureMap& default_phi_kernel_sig_map; }; -const inline framework::Attribute& GetAttr( +const inline framework::Attribute* GetAttr( const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const std::string& name) { auto it = attrs.find(name); @@ -225,10 +229,10 @@ const inline framework::Attribute& GetAttr( it = default_attrs.find(name); found = it != default_attrs.end(); } - PADDLE_ENFORCE_EQ( - found, true, - platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); - return it->second; + if (found) { + return &it->second; + } + return nullptr; } template @@ -311,7 +315,7 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, tensor_in = &(var.template Get()); kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { - paddle::SmallVector tensor_vector; + paddle::small_vector tensor_vector; auto& tensor_array = var.template Get(); for (auto& t : tensor_array) { tensor_vector.emplace_back(&t); @@ -326,6 +330,7 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(6) << "BuildDygraphPhiKernelContext: Inputs parsing completed."; for (size_t i = 0; i < output_names.size(); ++i) { size_t start_idx = (i == 0 ? 
0 : kernel_ctx->OutputRangeAt(i - 1).second); @@ -357,7 +362,7 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, tensor_out = var->template GetMutable(); kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { - paddle::SmallVector tensor_vector; + paddle::small_vector tensor_vector; auto* tensor_array = var->template GetMutable(); for (auto& t : *tensor_array) { @@ -376,203 +381,217 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(6) << "BuildDygraphPhiKernelContext: Outputs parsing completed."; for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_defs[i].type_index == std::type_index(typeid(phi::IntArray))) { - if (attrs.find(attr_names[i]) != - attrs.end()) { // shape is in the attribute - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - kernel_ctx->EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + VLOG(6) << "BuildDygraphPhiKernelContext: " << attr_names[i] << ": " + << attr_defs[i].type_index; + auto* attr_ptr = GetAttr(attrs, default_attrs, attr_names[i]); + switch (attr_defs[i].type_index) { + case phi::AttributeType::SCALAR: + if (attr_ptr) { + // scalar is in the attribute + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::FLOAT: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(BOOST_GET_CONST(float, attr)))); + break; + case framework::proto::AttrType::INT: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(BOOST_GET_CONST(int, attr)))); + break; + case framework::proto::AttrType::STRING: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(BOOST_GET_CONST(std::string, attr)))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "KernelContext in dygraph.", + attr_names[i])); + } + } else { // scalar is in the input + auto& ins_vector = ins.at(attr_names[i]); kernel_ctx->EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int64_t))) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::IntArray(&BOOST_GET_CONST(int64_t, attr), 1))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int32_t))) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::IntArray(&BOOST_GET_CONST(int32_t, attr), 1))); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); - kernel_ctx->EmplaceBackAttr(vector_int_attr); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to VectorTensor when " - "construct KernelContext.", - attr_names[i])); + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } - } else { // shape is in the input - auto& ins_vector = ins.at(attr_names[i]); - if (ins_vector.size() == 1) { // ShapeTensor - kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePhiIntArrayFromVar(ins_vector[0]->Var()))); - } else { // ShapeTensorList - std::vector variables; - variables.reserve(ins_vector.size()); - for (const auto& var_base : ins_vector) { - variables.push_back(var_base->MutableVar()); + break; + 
case phi::AttributeType::INT_ARRAY: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: + kernel_ctx->EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::LONGS: + kernel_ctx->EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::INT: + kernel_ctx->EmplaceBackAttr( + std::move(phi::IntArray(&BOOST_GET_CONST(int32_t, attr), 1))); + break; + case framework::proto::AttrType::LONG: + kernel_ctx->EmplaceBackAttr( + std::move(phi::IntArray(&BOOST_GET_CONST(int64_t, attr), 1))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "construct KernelContext.", + attr_names[i])); + } + } else { // shape is in the input + auto& ins_vector = ins.at(attr_names[i]); + if (ins_vector.size() == 1) { // ShapeTensor + kernel_ctx->EmplaceBackAttr(std::move( + experimental::MakePhiIntArrayFromVar(ins_vector[0]->Var()))); + } else { // ShapeTensorList + std::vector variables; + variables.reserve(ins_vector.size()); + for (const auto& var_base : ins_vector) { + variables.push_back(var_base->MutableVar()); + } + kernel_ctx->EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVarList(variables))); } - kernel_ctx->EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVarList(variables))); - } - } - } else if (attr_defs[i].type_index == - std::type_index(typeid(phi::Scalar))) { - // TODO(chenweihang): support other attrs later - // TODO(zhangyunfei): Scalar should hold scaler type, and we should check - // attribtue type by attr_defs - if (attrs.find(attr_names[i]) != attrs.end() || - default_attrs.find(attr_names[i]) != - default_attrs.end()) { // scalar is in the attribute - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(float, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::string))) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(std::string, attr)))); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int))) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(int, attr)))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to Scalar when construct " - "KernelContext in dygraph.", - attr_names[i])); - } - } else { // scalar is in the input - auto& ins_vector = ins.at(attr_names[i]); - kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); - } - - } else if (ins.find(attr_names[i]) != ins.end()) { - // deal tensor attr here - auto& ins_vector = ins.at(attr_names[i]); - auto tensor_attr = - experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); - if (attr_defs[i].type_index == std::type_index(typeid(int))) { - int val = tensor_attr.template to(); - kernel_ctx->EmplaceBackAttr(val); - } else { - PADDLE_THROW(platform::errors::Unimplemented("only support int here")); - } - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - 
scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); + break; + case phi::AttributeType::SCALARS: { + PADDLE_ENFORCE_NOT_NULL( + attr_ptr, + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "buildind dygraph KernelContext.", + attr_names[i])); + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::LONGS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOATS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOAT64S: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::BOOLEANS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to vector when " - "construct KernelContext.", - attr_names[i])); 
- } - } else { - // TODO(chenweihang): support other attrs later - - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (attr_defs[i].type_index == std::type_index(typeid(int))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::string))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(phi::DataType))) { - auto data_type = framework::TransToPhiDataType( - static_cast( - BOOST_GET_CONST(int, attr))); - kernel_ctx->EmplaceBackAttr(data_type); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - kernel_ctx->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Phi_Kernel args. - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); - const std::vector vector_int64_attr(vector_int_attr.begin(), - vector_int_attr.end()); - kernel_ctx->EmplaceBackAttr(vector_int64_attr); + } break; + default: { + PADDLE_ENFORCE_NOT_NULL( + attr_ptr, + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "buildind dygraph KernelContext.", + attr_names[i])); + auto& attr = *attr_ptr; + switch (attr_defs[i].type_index) { + case phi::AttributeType::FLOAT32: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + break; + case phi::AttributeType::INT32: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + break; + case phi::AttributeType::BOOL: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + break; + case phi::AttributeType::INT64: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + break; + case phi::AttributeType::INT32S: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = framework::TransToPhiDataType( + static_cast( + BOOST_GET_CONST(int, attr))); + kernel_ctx->EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + kernel_ctx->EmplaceBackAttr( + std::move(BOOST_GET_CONST(std::string, attr))); + break; + case phi::AttributeType::INT64S: { + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::LONGS: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case framework::proto::AttrType::INTS: { + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + kernel_ctx->EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector " + "when " + "construct KernelContext.", + attr_names[i])); + } + } break; + case phi::AttributeType::FLOAT32S: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::STRINGS: + kernel_ctx->EmplaceBackAttr( + 
BOOST_GET_CONST(std::vector, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when construct " + "KernelContext in dygraph.", + attr_names[i])); } - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - kernel_ctx->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == - std::type_index(typeid(std::vector))) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` when construct " - "KernelContext in dygraph.", - attr_names[i])); } } } + VLOG(6) << "BuildDygraphPhiKernelContext: Attributes parsing completed."; } template diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 3e2e082fbaa27..7b274339e3cbe 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -192,7 +192,7 @@ void Tracer::TraceOpImpl(const std::string& type, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type + " trace_op", platform::TracerEventType::Operator, 1); + "trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -220,30 +220,34 @@ void Tracer::TraceOpImpl(const std::string& type, attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); - NameVarMap new_ins = ins; + std::unique_ptr> ins_amp = nullptr; if (amp_level_ == AmpLevel::O1) { if (amp_dtype_ == phi::DataType::FLOAT16) { const auto& tracer = imperative::GetCurrentTracer(); - new_ins = - imperative::AutoTuneLayout(type, ins, outs, &attrs, tracer); VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; - new_ins = AutoCastInputs(type, new_ins); + ins_amp = std::make_unique>( + AutoCastInputs(type, imperative::AutoTuneLayout( + type, ins, outs, &attrs, tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; - new_ins = AutoCastBF16Inputs(type, ins); + ins_amp = std::make_unique>( + AutoCastBF16Inputs(type, ins)); } } else if (amp_level_ == AmpLevel::O2) { if (amp_dtype_ == phi::DataType::FLOAT16) { const auto& tracer = imperative::GetCurrentTracer(); - new_ins = - imperative::AutoTuneLayout(type, ins, outs, &attrs, tracer); VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; - new_ins = CastPureFp16Inputs(type, new_ins); + ins_amp = + std::make_unique>(CastPureFp16Inputs( + type, imperative::AutoTuneLayout(type, ins, outs, &attrs, + tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; - new_ins = CastPureBf16Inputs(type, ins); + ins_amp = std::make_unique>( + CastPureBf16Inputs(type, ins)); } } + const auto& new_ins = ins_amp == nullptr ? 
ins : *ins_amp; try { if (platform::is_gpu_place(place)) { @@ -320,7 +324,7 @@ void Tracer::TraceOpImpl(const std::string& type, { platform::RecordEvent node_creation_record_event( - type + " node_creation", platform::TracerEventType::OperatorInner, 1); + "grad_node_creation", platform::TracerEventType::OperatorInner, 1); if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 015f4471a0246..4f0d4a908380f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -48,6 +48,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/common/place.h" #include "paddle/utils/string/split.h" #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) @@ -1641,7 +1642,9 @@ AnalysisPredictor::~AnalysisPredictor() { StatisticShapeRangeInfo(); } - memory::Release(place_); + if (place_.GetType() != phi::AllocationType::UNDEFINED) { + memory::Release(place_); + } } std::unique_ptr AnalysisPredictor::Clone() { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 5e1a9b85ff586..0c68acfe98047 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -674,8 +674,39 @@ void Tensor::ORTCopyFromCpu(const T *data) { OrtMemTypeDefault); size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, std::multiplies()); - auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, - shape_.data(), shape_.size()); + size_t buffer_size = size * sizeof(T); + if (buffer_size > buffer_.size()) { + buffer_.resize(buffer_size); + } + std::memcpy(static_cast(buffer_.data()), data, buffer_size); + + auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + } + + if (onnx_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Found undefined data type for onnxruntime, only supports " + "float16/float32/float64/int8/uint8/int32/int64.")); + } + + auto ort_value = + Ort::Value::CreateTensor(memory_info, buffer_.data(), buffer_size, + shape_.data(), shape_.size(), onnx_dtype); + binding->BindInput(name_.c_str(), ort_value); } diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 6f99ed6e25a28..3cd2df3aef639 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -187,6 +187,7 @@ class PD_INFER_DECL Tensor { #ifdef PADDLE_WITH_ONNXRUNTIME bool is_ort_tensor_{false}; std::vector shape_; + std::vector buffer_; std::weak_ptr binding_; int idx_{-1}; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc 
b/paddle/fluid/memory/allocation/allocator_facade.cc index e2649a7fd334d..35ad27f4c62b5 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -835,6 +835,16 @@ class AllocatorFacadePrivate { platform::MLUPlace p(i); system_allocators_[p] = std::make_shared(p); } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto& dev_type : device_types) { + for (size_t dev_id = 0; + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) { + platform::CustomPlace p(dev_type, dev_id); + system_allocators_[p] = std::make_shared(p); + } + } #endif } diff --git a/paddle/fluid/memory/cuda_managed_memory_test.cu b/paddle/fluid/memory/cuda_managed_memory_test.cu index f8c9ff82f5712..f4b4294b5bdbf 100644 --- a/paddle/fluid/memory/cuda_managed_memory_test.cu +++ b/paddle/fluid/memory/cuda_managed_memory_test.cu @@ -107,7 +107,7 @@ TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) { uint64_t available_mem = platform::GpuAvailableMemToAlloc(); uint64_t n_data = available_mem * 2 / sizeof(int) + 1; // requires more than 2 * available_mem bytes - uint64_t step = 1024; + uint64_t step = std::max(n_data / 1024, static_cast(1)); AllocationPtr data_allocation = Alloc(platform::CUDAPlace(0), n_data * sizeof(int)); AllocationPtr sum_allocation = Alloc(platform::CUDAPlace(0), sizeof(int)); @@ -115,8 +115,8 @@ TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) { int* sum = static_cast(sum_allocation->ptr()); (*sum) = 0; - write_kernel<<<5120, 1024>>>(data, n_data, step); - sum_kernel<<<5120, 1024>>>(data, n_data, step, sum); + write_kernel<<<1, 1024>>>(data, n_data, step); + sum_kernel<<<1, 1024>>>(data, n_data, step, sum); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index f644d2f5875da..0906567dbf6c1 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -107,7 +107,7 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); break #define MEMORY_STAT_FUNC(item, id, func, ...) 
\ - do { \ + [&] { \ paddle::memory::StatBase* stat = nullptr; \ switch (id) { \ MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ @@ -133,8 +133,8 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); id)); \ break; \ } \ - stat->func(__VA_ARGS__); \ - } while (0) + return stat->func(__VA_ARGS__); \ + }() #define MEMORY_STAT_CURRENT_VALUE(item, id) \ MEMORY_STAT_FUNC(item, id, GetCurrentValue) diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index b974f606720b2..8354650df0237 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -68,6 +68,18 @@ class UpdateLossScalingOp : public framework::OperatorWithKernel { return framework::OpKernelType(dtype, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { +#ifndef PADDLE_WITH_XPU + if (var_name == "FoundInfinite" || var_name == "StopUpdate") { + return expected_kernel_type; + } +#endif + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } }; class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { @@ -93,6 +105,10 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling."); AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps."); AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps."); + AddOutput("StopUpdate", + "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " + "zero inputs. It has higher priority than Attr(stop_update).") + .AsDispensable(); AddAttr("incr_every_n_steps", "A value represents increasing loss scaling every n " "consecutive steps with finite gradients."); @@ -131,8 +147,8 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite. } }; -template -class UpdateLossScalingFunctor { +template +class UpdateLossScalingFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, @@ -141,6 +157,10 @@ class UpdateLossScalingFunctor { const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) const { + PADDLE_ENFORCE_EQ( + IsFoundInfOnCPU, true, + platform::errors::InvalidArgument( + "The Input(FoundInfinite) should be on the CPUPlace.")); Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling_data, good_out_data, diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index 6d9cd96a3fb9a..43f8f84578c70 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -21,9 +21,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template __global__ void GpuUpdateLossScaling( - const bool* found_inf_data, const T* pre_loss_scaling_data, + const FoundNanInfFlagT found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, const int* bad_in_data, const int incr_every_n_steps, const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, @@ -70,8 +70,9 @@ __global__ void FusedFillIf(T** outs, const size_t xs_size, } } -template -class UpdateLossScalingFunctor { +template +class UpdateLossScalingFunctor { public: void operator()(const platform::CUDADeviceContext& dev_ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, @@ -80,10 +81,17 @@ class UpdateLossScalingFunctor { const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) const { - GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( - found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, - incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, - updated_loss_scaling_data, good_out_data, bad_out_data); + if (IsFoundInfOnCPU) { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + *found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } else { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index d6eddd36a4551..41eb94247f593 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -40,8 +41,16 @@ inline HOSTDEVICE bool check_finite(T value) { #endif } -template -inline HOSTDEVICE void Update(const bool* found_inf_data, +inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { + return found_nan_inf_data; +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { + return *found_nan_inf_data; +} + +template +inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, const int* bad_in_data, const int incr_every_n_steps, @@ -49,7 +58,7 @@ inline HOSTDEVICE void Update(const bool* found_inf_data, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) { - if (*found_inf_data) { + if (IsFoundNanInf(found_inf_data)) { *good_out_data = 0; *bad_out_data = *bad_in_data + 1; if (*bad_out_data == decr_every_n_nan_or_inf) { @@ -72,7 +81,7 @@ inline HOSTDEVICE void Update(const bool* found_inf_data, } } -template +template class UpdateLossScalingFunctor { public: void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, @@ -106,9 +115,33 @@ class UpdateLossScalingKernel : public framework::OpKernel { platform::errors::InvalidArgument( "FoundInfinite must has only one element.")); const bool* found_inf_data = 
found_inf->data(); + bool is_found_inf_on_cpu = platform::is_cpu_place(found_inf->place()); + + if (is_found_inf_on_cpu) { + if (*found_inf_data) { + phi::funcs::SetConstant set_constant; + for (auto* out : outs) { + out->mutable_data(dev_ctx.GetPlace()); + set_constant(dev_ctx, out, static_cast(0)); + } + } + } else { + LazyZeros{}(dev_ctx, found_inf_data, xs, outs); + } - LazyZeros{}(dev_ctx, found_inf_data, xs, outs); - const bool stop_update = ctx.Attr("stop_update"); + const auto* stop_update_tensor = ctx.Input("StopUpdate"); + bool stop_update = false; + if (stop_update_tensor && stop_update_tensor->IsInitialized()) { + if (platform::is_cpu_place(stop_update_tensor->place())) { + stop_update = stop_update_tensor->data()[0]; + } else { + framework::Tensor tmp_tensor; + framework::TensorCopySync(*stop_update_tensor, platform::CPUPlace(), + &tmp_tensor); + stop_update = tmp_tensor.data()[0]; + } + } + stop_update |= ctx.Attr("stop_update"); if (stop_update) { return; } @@ -133,10 +166,17 @@ class UpdateLossScalingKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( - dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, - bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, - decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + if (is_found_inf_on_cpu) { + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + } else { + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 1393da7dd57a7..5808841333f08 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -131,7 +131,8 @@ void Update(const platform::NPUDeviceContext& ctx, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: void operator()(const platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, @@ -236,7 +237,7 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( + UpdateLossScalingFunctor{}( dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling, good_out, bad_out); diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index da138fb482e5a..0893324c602a8 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -53,8 +53,12 @@ class BatchNormXPUKernel : public framework::OpKernel { "But received: the size of input's dimensions is [%d]", x_dims.size())); - int N, C, H, W, D; + int N = -1, C = -1, H = -1, W = -1, D = -1; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + N = (N == 0) ? 1 : N; + C = (C == 0) ? 1 : C; + H = (H == 0) ? 
1 : H; + W = (W == 0) ? 1 : W; const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); @@ -103,12 +107,6 @@ class BatchNormXPUKernel : public framework::OpKernel { "The batch_norm XPU API return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); } else { - PADDLE_ENFORCE_EQ( - data_layout_str == "NCHW", true, - platform::errors::InvalidArgument( - "The batch_norm_infer 'data_layout' attribute must be NCHW. " - "But recevived 'data_layout' is [%s].", - data_layout_str)); const auto *mean = ctx.Input("Mean"); const auto *variance = ctx.Input("Variance"); const auto *mean_data = mean->data(); @@ -222,8 +220,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "But received: the size of input's dimensions is [%d]", x_dims.size())); - int N, C, H, W, D; + int N = -1, C = -1, H = -1, W = -1, D = -1; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + N = (N == 0) ? 1 : N; + C = (C == 0) ? 1 : C; + H = (H == 0) ? 1 : H; + W = (W == 0) ? 1 : W; const auto *x_data = x->data(); const auto *d_y_data = d_y->data(); diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 39acb50d4e870..82d3b1b1dbfea 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -83,7 +83,6 @@ class CCommInitOp : public framework::OperatorBase { UniqueId* comm_id = var->GetMutable(); int nranks = Attr("nranks"); - int rank_id = Attr("rank"); int rid = Attr("ring_id"); #if defined(PADDLE_WITH_XPU_BKCL) @@ -98,8 +97,18 @@ class CCommInitOp : public framework::OperatorBase { if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } + +#if defined(PADDLE_WITH_XPU_BKCL) && defined(PADDLE_WITH_HETERPS) && \ + defined(PADDLE_WITH_PSLIB) + // XPUPS rank_id only equals 0, so replace rank_id with device_id + CommContext::Instance().CreateComm(comm_id, nranks, device_id, device_id, + rid); +#else + int rank_id = Attr("rank"); CommContext::Instance().CreateComm(comm_id, nranks, rank_id, device_id, rid); +#endif + #endif } }; diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 42584948e0651..088366dbc8f69 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -76,7 +76,15 @@ class CSyncCalcStreamKernel : public framework::OpKernel { auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::MLUStreamSync(dev_ctx->stream()); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on xpu place only for now.")); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + dev_ctx->Wait(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); @@ -97,3 +105,5 @@ REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); + +REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 37ce4ef7ee21d..5a9a00aa8e4d2 100644 --- 
a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ #endif #if defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif @@ -28,6 +27,10 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/cncl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + namespace paddle { namespace operators { @@ -94,7 +97,16 @@ class CSyncCommStreamKernel : public framework::OpKernel { auto stream = platform::CNCLCommContext::Instance().Get(ring_id, place)->stream(); platform::MLUStreamSync(stream); - +#elif defined(PADDLE_WITH_XPU_BKCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on xpu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto comm_dev_ctx = platform::BKCLCommContext::Instance() + .Get(ring_id, place) + ->dev_context(); + comm_dev_ctx->Wait(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); @@ -115,3 +127,5 @@ REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); REGISTER_OP_MLU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); + +REGISTER_OP_XPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 784385d79bd3e..96b27a833fba3 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" + namespace paddle { namespace operators { @@ -42,6 +45,20 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; auto place = ctx.GetPlace(); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup *pg = map->get(rid); + std::vector out_tensor; + auto out_shape = ctx.Attr>("out_shape"); + auto out = ctx.Output("Out"); + auto out_dims = out->dims(); + out->mutable_data(out_dims, place); + + out_tensor.emplace_back(*out); + auto task = pg->Recv(out_tensor, peer); + return; + } auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 6a2244b91025a..c31f1210f0422 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -35,6 +37,15 @@ class CRecvOpASCENDKernel : public framework::OpKernel { platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype())); int ring_id = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(ring_id)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(ring_id); + std::vector out_tensor; + out_tensor.emplace_back(*out); + auto task = pg->Recv(out_tensor, 0); + return; + } auto place = ctx.GetPlace(); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 3e565d1b975bc..add352306fa28 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -39,6 +41,16 @@ class SendOpV2CUDAKernel : public framework::OpKernel { peer, 0, platform::errors::InvalidArgument( "The peer (%d) for send_v2 op must be non-negative.", peer)); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + auto x = ctx.Input("X"); + in_tensor.push_back(*x); + auto task = pg->Send(in_tensor, peer); + return; + } gpuStream_t stream = nullptr; auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 3bc5487371bac..2d7382f3dfd70 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -34,6 +36,16 @@ class CSendOpASCENDKernel : public framework::OpKernel { platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(ring_id)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(ring_id); + std::vector in_tensor; + auto x = ctx.Input("X"); + in_tensor.push_back(*x); + auto task = pg->Send(in_tensor, 1); + return; + } auto place = ctx.GetPlace(); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc index e4751f1f26008..cc5c20d392809 100644 --- a/paddle/fluid/operators/conv_op_xpu.cc +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -38,9 +38,10 @@ class GemmConvXPUKernel : public framework::OpKernel { const std::string padding_algorithm = context.Attr("padding_algorithm"); - PADDLE_ENFORCE_EQ(data_format == "NHWC" || data_format == "NDHWC", false, - platform::errors::InvalidArgument( - ("XPU do support data_format is NCHW in conv op."))); + PADDLE_ENFORCE_EQ( + data_format == "NDHWC", false, + platform::errors::InvalidArgument( + ("XPU does not support data_format is NDHWC in conv op."))); framework::DDim in_data_dims = phi::slice_ddim(input->dims(), 2, input->dims().size()); @@ -50,11 +51,18 @@ class GemmConvXPUKernel : public framework::OpKernel { UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - const int batch_size = static_cast(input->dims()[0]); - const int img_c = static_cast(input->dims()[1]); - const int img_h = static_cast(input->dims()[2]); - const int img_w = static_cast(input->dims()[3]); - const int f = static_cast(filter.dims()[0]); + int batch_size = static_cast(input->dims()[0]); + int img_c = static_cast(input->dims()[1]); + int img_h = static_cast(input->dims()[2]); + int img_w = static_cast(input->dims()[3]); + int f = static_cast(filter.dims()[0]); + bool is_nchw = true; + if (data_format == "NHWC") { + img_c = static_cast(input->dims()[3]); + img_h = static_cast(input->dims()[1]); + img_w = static_cast(input->dims()[2]); + is_nchw = false; + } const XPUT *input_data = reinterpret_cast(input->data()); const XPUT *filter_data = reinterpret_cast(filter.data()); @@ -64,7 +72,7 @@ class GemmConvXPUKernel : public framework::OpKernel { int r = xpu::conv2d( dev_ctx.x_context(), input_data, filter_data, output_data, batch_size, img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups, - nullptr, nullptr, nullptr, true); + nullptr, nullptr, nullptr, is_nchw); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU conv kernel return wrong value[%d %s]", @@ -99,9 +107,9 @@ class GemmConvGradXPUKernel : public framework::OpKernel { context.Attr("padding_algorithm"); PADDLE_ENFORCE_EQ( - data_format == "NHWC" || data_format == "NDHWC", false, + data_format == "NDHWC", false, platform::errors::InvalidArgument( - ("XPU do support data_format is NCHW in conv grad op."))); + ("XPU doesn't support data_format is NDHWC in conv grad op."))); framework::DDim in_data_dims = phi::slice_ddim(input->dims(), 2, input->dims().size()); @@ -111,11 +119,18 @@ class GemmConvGradXPUKernel : 
public framework::OpKernel { UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - const int batch_size = static_cast(input->dims()[0]); - const int img_c = static_cast(input->dims()[1]); - const int img_h = static_cast(input->dims()[2]); - const int img_w = static_cast(input->dims()[3]); - const int f = static_cast(filter.dims()[0]); + int batch_size = static_cast(input->dims()[0]); + int img_c = static_cast(input->dims()[1]); + int img_h = static_cast(input->dims()[2]); + int img_w = static_cast(input->dims()[3]); + int f = static_cast(filter.dims()[0]); + bool is_nchw = true; + if (data_format == "NHWC") { + img_c = static_cast(input->dims()[3]); + img_h = static_cast(input->dims()[1]); + img_w = static_cast(input->dims()[2]); + is_nchw = false; + } const XPUT *input_data = reinterpret_cast(input->data()); const XPUT *filter_data = reinterpret_cast(filter.data()); @@ -136,7 +151,7 @@ class GemmConvGradXPUKernel : public framework::OpKernel { dev_ctx.x_context(), input_data, filter_data, output_grad_data, input_grad_data, filter_grad_data, batch_size, img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups, nullptr, nullptr, nullptr, - nullptr, nullptr, true); + nullptr, nullptr, is_nchw); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU conv kernel return wrong value[%d %s]", diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc new file mode 100644 index 0000000000000..b88974a51ceff --- /dev/null +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class DropoutMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto dropout_prob = ctx.Attr("dropout_prob"); + auto is_test = ctx.Attr("is_test"); + auto* seed_tensor = + ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; + auto dropout_implementation = + ctx.Attr("dropout_implementation"); + + const bool is_upscale = (dropout_implementation == "upscale_in_train"); + + out->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + + if (!is_test) { + // exec dropout op for training only. + int seed_data = 0; + if (seed_tensor) { + if (platform::is_mlu_place(seed_tensor->place())) { + memory::Copy(platform::CPUPlace(), &seed_data, seed_tensor->place(), + seed_tensor->data(), sizeof(int)); + } else { + seed_data = *(seed_tensor->data()); + } + } else { + seed_data = ctx.Attr("fix_seed") ? 
ctx.Attr("seed") : 0; + } + + auto* mask = ctx.Output("Mask"); + mask->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc mask_desc(*mask); + // Special case when dropout_prob is 1.0 + if (dropout_prob == 1.0f) { + auto value_t = static_cast(0.0f); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), + GetBasePtr(out)); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, mask_desc.get(), + GetBasePtr(mask)); + return; + } + + // create mlu random generator + const int device_id = ctx.GetPlace().GetDeviceId(); + auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data); + + const float prob = is_upscale ? dropout_prob : 0.0f; + MLUCnnl::FusedDropout( + ctx, mlu_gen_random->get(), x_desc.get(), GetBasePtr(x), prob, + GetBasePtr(&(mlu_gen_random->get_state())), mask_desc.get(), + GetBasePtr(mask), out_desc.get(), GetBasePtr(out)); + } else { + // exec dropout op for inference only. + if (is_upscale) { + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + } else { + float scale = static_cast(1.0f - dropout_prob); + Tensor scale_tensor(x->dtype()); + scale_tensor.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc scale_desc(scale_tensor); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(), + GetBasePtr(&scale_tensor)); + + auto data_type = ToCnnlDataType(); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, data_type, + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), + GetBasePtr(x), scale_desc.get(), + GetBasePtr(&scale_tensor), out_desc.get(), + GetBasePtr(out), data_type); + } + } + } +}; + +template +class DropoutGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ(!ctx.Attr("is_test"), true, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + auto* grad_x = ctx.Output(framework::GradVarName("X")); + auto* grad_out = ctx.Input(framework::GradVarName("Out")); + auto* mask = ctx.Input("Mask"); + auto dropout_prob = ctx.Attr("dropout_prob"); + auto dropout_impl = ctx.Attr("dropout_implementation"); + + grad_x->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc grad_x_desc(*grad_x); + + if (dropout_prob == 1.) { + auto value_t = static_cast(0.0f); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, grad_x_desc.get(), + GetBasePtr(grad_x)); + return; + } + + // cast mask from uint8 to float32/float16 + Tensor cast_mask(grad_x->dtype()); + cast_mask.Resize(mask->dims()); + cast_mask.mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc mask_desc(*mask); + MLUCnnlTensorDesc cast_mask_desc(cast_mask); + cnnlCastDataType_t cast_type = + GetCastDataType(framework::TransToProtoVarType(mask->dtype()), + framework::TransToProtoVarType(cast_mask.dtype())); + + MLUCnnl::Cast(ctx, cast_type, mask_desc.get(), GetBasePtr(mask), + cast_mask_desc.get(), GetBasePtr(&cast_mask)); + + const bool is_upscale = (dropout_impl == "upscale_in_train"); + const float scale = is_upscale ? 
(1.0f / (1.0f - dropout_prob)) : (1.0f); + + auto data_type = ToCnnlDataType(); + MLUCnnlTensorDesc grad_out_desc(*grad_out); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, data_type, + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), cast_mask_desc.get(), + GetBasePtr(&cast_mask), grad_out_desc.get(), + GetBasePtr(grad_out), grad_x_desc.get(), + GetBasePtr(grad_x), data_type, scale); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(dropout, ops::DropoutMLUKernel, + ops::DropoutMLUKernel); + +REGISTER_OP_MLU_KERNEL(dropout_grad, ops::DropoutGradMLUKernel, + ops::DropoutGradMLUKernel); diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc new file mode 100644 index 0000000000000..8fdde1ccdc058 --- /dev/null +++ b/paddle/fluid/operators/einsum_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { +class EinsumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +class EinsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Operands", "(TensorList), The input tensor of einsum op.") + .AsDuplicable(); + AddOutput("Out", "(Tensor), The output tensor of einsum op."); + AddAttr("equation", + "(string) A einsum equation. such as `ij,jk->ik`" + "There must have `->` and the number of operands in " + "equation must equals the `Operands` length."); + AddComment(R"DOC( +Einsum Operator. + +This operator is used to perform einsum operation for given operands and equation. 
+)DOC"); + } +}; + +class EinsumGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + auto x_name = "Operands"; + auto x_grad_name = framework::GradVarName(x_name); + ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim(x_name)); + ctx->ShareAllLoD(x_name, x_grad_name); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class EinsumGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("einsum_grad"); + retv->SetInput("Operands", this->Input("Operands")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("Operands"), + this->InputGrad("Operands", false)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(einsum, EinsumInferShapeFunctor, + PD_INFER_META(phi::EinsumInferMeta)); + +REGISTER_OPERATOR(einsum, ops::EinsumOp, ops::EinsumOpMaker, + EinsumInferShapeFunctor, + ops::EinsumGradMaker, + ops::EinsumGradMaker); + +REGISTER_OPERATOR(einsum_grad, ops::EinsumGradOp); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 80e7f5c001d4b..68b9051d85831 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,6 +19,7 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op + fused_multi_transformer_op resnet_unit_op fused_gemm_epilogue_op) @@ -73,6 +74,7 @@ if (WITH_GPU OR WITH_ROCM) op_library(fused_feedforward_op) # fused_attention_op op_library(fused_attention_op) + op_library(fused_multi_transformer_op) endif() # resnet_unit needs cudnn 8.0 above if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index b3ac3606eaf8e..c5adee547bdac 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" #include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -33,6 +34,7 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP_ITSELF(batch_norm); +PD_DECLARE_KERNEL(batch_norm, GPU, ALL_LAYOUT); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index a80f590aa495d..884fca2c1b0b8 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -164,6 +164,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, attrs.insert({"groups", groups}); attrs.insert({"exhaustive_search", exhaustive_search}); attrs.insert({"use_addto", use_addto}); + attrs.insert({"workspace_size_MB", 512}); auto op = framework::OpRegistry::CreateOp( "conv2d_grad", {{"Input", {"Input"}}, @@ -408,7 +409,7 @@ TEST(CudnnNormConvFp16, K1S1) { platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() < 70) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -434,7 +435,7 @@ TEST(CudnnNormConvFp16, K3S1) { platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() < 70) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -460,7 +461,7 @@ TEST(CudnnNormConvFp16, K1S1O4) { platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() < 70) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 6bf3a7114f4ce..0fe76fa23a637 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const platform::CUDADeviceContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { const uint32_t tmp_cols = cols / vec_size; - int threads = std::max( - static_cast(32), - std::min(tmp_cols, static_cast(ctx.GetMaxThreadsPerBlock()))); + // NOTE(wangxi): We set max_block_size to 512, for `FusedResidualDropoutBias` + // needs too many register resources. If data_type is float16, CUDA + // error(701) will occur when block_size is 1024. Which error is + // 'cudaErrorLaunchOutOfResources', this indicates that a launch did not + // occur because it did not have appropriate resources. + // Of course, this kernel can be optimized later to reduce the use + // of registers. 
+ int threads = + std::max(static_cast(32), + std::min(tmp_cols, static_cast(std::min( + ctx.GetMaxThreadsPerBlock(), 512)))); const auto blocks_x = std::max(static_cast(1), (tmp_cols + threads - 1) / threads); const auto blocks_y = std::max(static_cast(1), rows); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index d53a24a57e3cc..aa613dd3f5ce0 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -156,9 +156,9 @@ __global__ void FusedLayernormResidualDropoutBias( } /* -* @brief layernorm(residual + dropout(x)); + * @brief layernorm(residual + dropout(x)); * Conditions: - * (1) The number of cols is 1024; + * (1) The number of cols is 768/1024/4096; * (2) layer_norm scale and bias is not null; * (3) linear bias is null; * @param @@ -166,6 +166,7 @@ __global__ void FusedLayernormResidualDropoutBias( * cols: 1024 * x_: [rows, cols], inputs * residual_:[rows, cols] + * bias_: [cols], linear bias, can be null * gamma_: [cols]: layernorm scale, not null * beta_: [cols], layernorm bias, not null * mask_out_: [rows, cols], dropout result @@ -173,7 +174,7 @@ __global__ void FusedLayernormResidualDropoutBias( * y_: [rows, cols], layernorm result * mean_out_: [rows]: layernorm means * var_out_: [rows]: layernorm vars -*/ + */ template < typename T, typename U, typename ScaleT = U, typename MaskType = uint8_t, int VecSize = 8, int WARPS_M = 4, int WARPS_N = 1, int BYTES_PER_LDG = 16, @@ -182,14 +183,16 @@ template < int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> -__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( int rows, int cols, uint64_t seed, const float dropout_prob, const bool is_upscale_in_train, const bool is_test, const uint64_t increment, const float epsilon, const T *__restrict__ x_ptr, - const T *__restrict__ residual_ptr, const ScaleT *__restrict__ gamma_ptr, - const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, - U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, - T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { + const T *__restrict__ residual_ptr, const T *__restrict__ bias_ptr, + const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, + MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, + U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, + T *__restrict__ y_ptr) { + __shared__ U smem[WARPS_M * WARPS_N]; using Vec = phi::AlignedVector; using Vec_scale = phi::AlignedVector; using MaskStoreT = phi::AlignedVector; @@ -204,12 +207,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const int c = warp_n * THREADS_PER_WARP + lane; // lane const int r = bidx * ROWS_PER_CTA + warp_m; // row id - int idx = r * LN_NUM_COLS + c; + int idx = r * ELTS_PER_ROW + c; curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + // bias + Vec bias[LDGS]; + if (bias_ptr != nullptr) { +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + phi::Load(bias_ptr + col * VecSize, &bias[it]); + col += THREADS_PER_ROW; + } + } + Vec_scale gamma[LDGS]; Vec_scale 
beta[LDGS]; #pragma unroll @@ -219,14 +232,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( col += THREADS_PER_ROW; } - constexpr U rn = 1.f / U(LN_NUM_COLS); + constexpr U rn = 1.f / U(ELTS_PER_ROW); for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { Vec x[LDGS]; Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); - phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * ELTS_PER_ROW + col * VecSize, &residual[it]); col += THREADS_PER_ROW; } @@ -255,14 +268,28 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // 4 * 8 U xf[LDGS * VecSize]; + if (bias_ptr != nullptr) { #pragma unroll - for (int it = 0; it < LDGS; it++) { + for (int it = 0; it < LDGS; it++) { #pragma unroll - for (int jt = 0; jt < VecSize; jt++) { - // dropout(x) + residual - x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + - residual[it][jt]; - xf[it * VecSize + jt] = U(x[it][jt]); + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = (x[it][jt] + bias[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } + } + } else { +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } } } @@ -270,9 +297,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { phi::Store( - x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); + x[it], residual_out_ptr + row * ELTS_PER_ROW + col * VecSize); phi::Store( - mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); + mask_vec[it], mask_out_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } @@ -289,6 +316,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = mu_local; + } + __syncthreads(); + if (tidx == 0) { + mu_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + mu_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = mu_local; + } + __syncthreads(); + mu_local = smem[warp_m]; + } mu_local *= rn; if (lane == 0) { mean_out_ptr[row] = mu_local; @@ -308,6 +351,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = var_local; + } + __syncthreads(); + if (tidx == 0) { + var_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + var_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = var_local; + } + __syncthreads(); + var_local = smem[warp_m]; + } U rsigma = rsqrtf(var_local * rn + epsilon); if (lane == 0) { // Note: the stored var is different for paddle(ln) and apex (fast ln). 
@@ -332,7 +391,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } } @@ -390,12 +449,37 @@ void LaunchLayernormResidualDropoutBias( return; } - bool can_call_1024_kernel = false; - if (cols == 1024 && scale != nullptr && layernorm_bias != nullptr && - bias == nullptr) { - can_call_1024_kernel = true; +#define LAUNCH_FUSED_FAST_LN_KERNEL_BASE(cols) \ + case (cols): { \ + constexpr int WARPS_N = cols < 1024 ? 1 : (cols / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int grid = \ + static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ + fused_fast_ln_fwd_kernel< \ + T, U, LayerNormScaleBiasT, uint8_t, \ + VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, \ + cols><<>>( \ + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ + increment, epsilon, src, residual, bias, scale, layernorm_bias, \ + mask_data, mean, var, dst, layernorm_dst); \ + } break + +#define LAUNCH_FUSED_FAST_LN_KERNEL \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(768); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1024); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096) + + bool can_call_fast_ln_kernel = false; + if ((cols == 768 || cols == 1024 || cols == 4096) && scale != nullptr && + layernorm_bias != nullptr) { + can_call_fast_ln_kernel = true; } - VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; + VLOG(6) << "can_call_fast_ln_kernel = " << can_call_fast_ln_kernel; const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { @@ -407,26 +491,15 @@ void LaunchLayernormResidualDropoutBias( epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - // Note: the grid can not exceed max_grid of the gpu. - const int grid = - static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); - fused_ln_fwd_1024_kernel< - T, U, LayerNormScaleBiasT, uint8_t, - VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, - increment, epsilon, src, residual, scale, layernorm_bias, mask_data, - mean, var, dst, layernorm_dst); + if (can_call_fast_ln_kernel) { + switch (cols) { + LAUNCH_FUSED_FAST_LN_KERNEL; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only when column is equal to 768/1024/4096 is supported for " + "now")); + break; + } } else { int blockDim = GetDesiredBlockDim(cols / VecSize); FusedLayernormResidualDropoutBias< diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc new file mode 100644 index 0000000000000..c95ca6fe0c96c --- /dev/null +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FusedMultiTransformerOp : public framework::OperatorWithKernel { + private: + static constexpr const char *OpName = "FusedMultiTransformerOp"; + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { +#define CHECK_INPUT(name) \ + OP_INOUT_CHECK(ctx->HasInput(#name), "Input", #name, OpName) +#define CHECK_INPUTS(name) \ + OP_INOUT_CHECK(ctx->HasInputs(#name), "Input", #name, OpName) +#define CHECK_OUTPUT(name) \ + OP_INOUT_CHECK(ctx->HasOutput(#name), "Output", #name, OpName) +#define CHECK_OUTPUTS(name) \ + OP_INOUT_CHECK(ctx->HasOutputs(#name), "Output", #name, OpName) + + CHECK_INPUT(X); + + // attention + CHECK_INPUTS(QKVW); + CHECK_INPUTS(OutLinearW); + + if (ctx->HasInput("TimeStep")) { + CHECK_INPUTS(CacheKV); + } + + if (ctx->HasInputs("CacheKV")) { + CHECK_OUTPUTS(CacheKVOut); + } + + // ffn + CHECK_INPUTS(FFN1Weight); + CHECK_INPUTS(FFN2Weight); + + CHECK_OUTPUT(Out); + + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputsDim("QKVW")[0]; + PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument( + "The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + PADDLE_ENFORCE_EQ(y_dim.size(), 4, + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "but received dimensions of" + "Input is [%d]", + y_dim.size())); + PADDLE_ENFORCE_EQ(x_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3]" + "must be equal. 
But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, y_dim)); + + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } + + if (ctx->HasInputs("CacheKV")) { + // [2, batch_size, num_head, max_seq_len, head_size] + const auto &c_dims = ctx->GetInputsDim("CacheKV"); + const auto &c_dim = c_dims[0]; + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GT( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + y_dim[2], c_dim[4])); // head_size + } + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "TimeStep") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class FusedMultiTransformerOpOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("LnScale", + "Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDuplicable(); + AddInput("LnBias", + "Bias is a 1-dimensional tensor of size " + "H. 
Here, H represents the last dimension of its input tensor.") + .AsDuplicable(); + AddInput("QKVW", "The qkv weight tensor.").AsDuplicable(); + AddInput("QKVBias", "The qkv bias tensor.").AsDispensable().AsDuplicable(); + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable() + .AsDuplicable(); + AddInput("TimeStep", + "(optional, int) The time step for generation inference.") + .AsDispensable(); + AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") + .AsDispensable(); + AddInput("OutLinearW", "The out_linear weight tensor.").AsDuplicable(); + AddInput("OutLinearBias", "The out_linear bias tensor.") + .AsDispensable() + .AsDuplicable(); + + AddInput("FFNLnScale", "The layer_norm scale of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFNLnBias", "The layer_norm bias of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFN1Weight", "The linear1 weight of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFN1Bias", "The linear1 bias of FusedFeedForward op") + .AsDispensable() + .AsDuplicable(); + AddInput("FFN2Weight", "The linear2 weight of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFN2Bias", "The linear2 bias input of FusedFeedForward op") + .AsDispensable() + .AsDuplicable(); + + AddOutput("CacheKVOut", "The updated cache KV. Inplace with CacheKV") + .AsDispensable() + .AsDuplicable(); + AddOutput("Out", "Result after the fused multi transformer layers."); + + AddAttr("pre_layer_norm", + "if true, the attention op uses the pre_layer_norm architecture, " + "else, uses the post_layer_norm architecture. " + "[default true].") + .SetDefault(true); + AddAttr("epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' in Op(LayerNorm) should be between " + "0.0 and 0.001, but received [%s].", + epsilon)); + }); + + AddAttr("dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + }); + + AddAttr("dropout_is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "The meaning is the same as 'attn_dropout_implementation'.") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("act_method", "The activation method.").SetDefault("gelu"); + + AddAttr( + "ring_id", + "ring id for tensor model parallel.
distributed training and inference") + .SetDefault(-1); + + AddComment(R"DOC(fused multi transformer layers op)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_multi_transformer, ops::FusedMultiTransformerOp, + ops::FusedMultiTransformerOpOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu new file mode 100644 index 0000000000000..e38ac9a0ad2da --- /dev/null +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -0,0 +1,1343 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// This file has been adapted from FasterTransformer file: +// https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu +// We add License in the head. + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/operators/fused/attention_layer_norm.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fmha_ref.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// for debug +// #define _DEBUG_FUSED_MULTI_TRANSFORMER + +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + +namespace { + +namespace plat = paddle::platform; +using float16 = plat::float16; + +#define MMHA_USE_FP32_ACUM_FOR_LOGITS +#define MMHA_USE_FP32_ACUM_FOR_OUT + +template +struct 
Masked_multihead_attention_params { + // output buffer, [B, 1(seq_len), num_head * dim_head] + T *out; + // qkv_out, [B, 1(seq_len), 3, num_head * dim_head] + const T *qkv; + // bias, [3, num_head, dim_head] + const T *qkv_bias; + // TODO(wangxi): optimize with input_lengths and max_input_len? + // [bsz, 1, 1, time_step(cache_seq_length)+1] + const T *attn_mask; + + // [2, B, num_head, max_seq_len(valid cache_seq_len), dim_head] + // k [B, num_head, dim_head/x, max_seq_len, x], that is `seq_len` first + // v [B, num_head, max_seq_len, dim_head] + T *cache_kv; + + int batch_size; + int num_head; + int timestep; // cache_seq_length + int max_seq_length; + + // 1.f / sqrt(Dh) + float inv_sqrt_dh; +}; + +struct Float8_ { + float2 x; + float2 y; + float2 z; + float2 w; +}; + +// clang-format off + +template struct Qk_vec_ {}; +template <> struct Qk_vec_ { using Type = float; }; +template <> struct Qk_vec_ { using Type = float2; }; +template <> struct Qk_vec_ { using Type = float4; }; +template <> struct Qk_vec_ { using Type = uint32_t; }; +template <> struct Qk_vec_ { using Type = uint32_t; }; +template <> struct Qk_vec_ { using Type = uint2; }; + +template struct K_vec_ {}; +template <> struct K_vec_ { using Type = float; }; +template <> struct K_vec_ { using Type = float2; }; +template <> struct K_vec_ { using Type = float4; }; +template <> struct K_vec_ { using Type = uint32_t; }; +template <> struct K_vec_ { using Type = uint2; }; +template <> struct K_vec_ { using Type = uint4; }; + +template struct V_vec_ {}; +template <> struct V_vec_ { using Type = float; }; +template <> struct V_vec_ { using Type = float2; }; +template <> struct V_vec_ { using Type = float4; }; +template <> struct V_vec_ { using Type = uint32_t; }; +template <> struct V_vec_ { using Type = uint2; }; +template <> struct V_vec_ { using Type = uint4; }; + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT +template struct V_vec_acum_fp32_ {}; +// template <> struct V_vec_acum_fp32_ { using Type = float; }; +// template <> struct V_vec_acum_fp32_ { using Type = float2; }; +template <> struct V_vec_acum_fp32_ { using Type = float4; }; +// template <> struct V_vec_acum_fp32_ { using Type = float2; }; +// template <> struct V_vec_acum_fp32_ { using Type = Float4_; }; +template <> struct V_vec_acum_fp32_ { using Type = Float8_; }; +#endif + +// clang-format on + +inline __device__ float half_to_float(uint16_t h) { + float f; + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); + return f; +} + +inline __device__ float2 half2_to_float2(uint32_t v) { + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); + return make_float2(half_to_float(lo), half_to_float(hi)); +} + +inline __device__ uint32_t float2_to_half2(float2 f) { + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" + : "=r"(tmp.u32) + : "f"(f.y), "f"(f.x)); +#else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); +#endif + return tmp.u32; +} + +inline __device__ float add(float a, float b) { return a + b; } + +inline __device__ float2 add(float2 a, float2 b) { + float2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ float4 add(float4 a, float4 b) { + float4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ uint16_t add(uint16_t a, 
uint16_t b) { + uint16_t c; + asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +inline __device__ uint32_t add(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +inline __device__ uint2 add(uint2 a, uint2 b) { + uint2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ uint4 add(uint4 a, uint4 b) { + uint4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ float2 add(uint32_t a, float2 fb) { + float2 fa = half2_to_float2(a); + return add(fa, fb); +} + +inline __device__ Float8_ add(uint4 a, Float8_ fb) { + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} + +template +inline __device__ Acc mul(A a, B b); + +template <> +inline __device__ float mul(float a, float b) { + return a * b; +} + +template <> +inline __device__ float2 mul(float2 a, float2 b) { + float2 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + return c; +} + +template <> +inline __device__ float4 mul(float4 a, float4 b) { + float4 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; + return c; +} + +template <> +inline __device__ uint16_t mul(uint16_t a, uint16_t b) { + uint16_t c; + asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +template <> +inline __device__ uint32_t mul(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +template <> +inline __device__ uint2 mul(uint2 a, uint2 b) { + uint2 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + return c; +} + +template <> +inline __device__ uint4 mul(uint4 a, uint4 b) { + uint4 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + c.z = mul(a.z, b.z); + c.w = mul(a.w, b.w); + return c; +} + +inline __device__ float sum(float v) { return v; } +inline __device__ float sum(float2 v) { return v.x + v.y; } +inline __device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } +inline __device__ float sum(uint16_t v) { return half_to_float(v); } +inline __device__ float sum(uint32_t v) { + float2 tmp = half2_to_float2(v); + return tmp.x + tmp.y; +} + +inline __device__ float sum(uint2 v) { + uint32_t c = add(v.x, v.y); + return sum(c); +} + +inline __device__ float sum(uint4 v) { + uint32_t c = add(v.x, v.y); + c = add(c, v.z); + c = add(c, v.w); + return sum(c); +} + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +inline __device__ constexpr uint32_t shfl_mask(int threads) { + return threads == 32 ? 
uint32_t(-1) : (1u << threads) - 1u; +} + +template +inline __device__ __host__ T div_up(T m, T n) { + return (m + n - 1) / n; +} + +inline __device__ float fma(float a, float b, float c) { return a * b + c; } + +inline __device__ float2 fma(float2 a, float2 b, float2 c) { + float2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ float4 fma(float4 a, float4 b, float4 c) { + float4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); + return d; +} + +inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) { + uint2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) { + uint4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ float2 fma(float a, float2 b, float2 c) { + float2 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +inline __device__ float4 fma(float a, float4 b, float4 c) { + float4 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { + Float8_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +inline __device__ uint32_t h0_h0(uint16_t a) { + uint32_t b; + asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); + return b; +} + +inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) { + return fma(h0_h0(a), b, c); +} + +inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) { + uint32_t s = h0_h0(a); + uint2 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) { + uint32_t s = h0_h0(a); + uint4 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +inline __device__ float cast_to_float(float u) { return u; } + +inline __device__ float2 cast_to_float(float2 u) { return u; } + +inline __device__ float4 cast_to_float(float4 u) { return u; } + +inline __device__ Float8_ cast_to_float(uint4 u) { + Float8_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + tmp.z = half2_to_float2(u.z); + tmp.w = half2_to_float2(u.w); + return tmp; +} + +template +inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) { + K_vec qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } + + float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +template +struct Qk_dot { + template + static inline __device__ float dot(const K_vec (&q)[N], const K_vec (&k)[N]) { + return qk_dot_(q, k); + } +}; + +template +inline __device__ float block_sum(float *red_smem, float sum) { + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + if (lane == 0) { + red_smem[warp] = sum; + } + 
__syncthreads(); + + if (lane < WARPS_PER_BLOCK) { + sum = red_smem[lane]; + } + +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + return __shfl_sync(uint32_t(-1), sum, 0); +} + +inline __device__ void convert_from_float(float &dst, float src) { // NOLINT + dst = src; +} + +inline __device__ void convert_from_float(float4 &dst, float4 src) { // NOLINT + dst = src; +} + +inline __device__ void convert_from_float(plat::float16 &dst, // NOLINT + float src) { + dst = static_cast(src); +} + +inline __device__ void convert_from_float(uint4 &dst, Float8_ src) { // NOLINT + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); + dst.z = float2_to_half2(src.z); + dst.w = float2_to_half2(src.w); +} + +inline __device__ void zero(uint16_t &dst) { dst = uint16_t(0); } // NOLINT + +template +inline __device__ void zero(T &dst) { // NOLINT + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +template +__global__ void masked_multihead_attention_kernel( + Masked_multihead_attention_params params) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + + static_assert(Dh % THREADS_PER_KEY == 0, ""); + static_assert(Dh % THREADS_PER_VALUE == 0, ""); + + constexpr int WARP_SIZE = 32; + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + + char *logits_smem_ = smem_; + // fp32 accum for logits + float *logits_smem = reinterpret_cast(logits_smem_); + + T *out_smem = reinterpret_cast(smem_); + + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + __shared__ T q_smem[Dh]; + + const int bi = blockIdx.y; + const int hi = blockIdx.x; + const int bhi = bi * params.num_head + hi; + const int tid = threadIdx.x; + + float qk_max = -FLT_MAX; + + // qkv [B, S=1, 3, num_head, head_dim] + int qkv_base_offset = bi * 3 * params.num_head * Dh + hi * Dh; + + using Qk_vec = typename Qk_vec_::Type; + constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); + static_assert(Dh % QK_VEC_SIZE == 0 && Dh / QK_VEC_SIZE <= WARP_SIZE, ""); + constexpr int QK_VECS_PER_WARP = Dh / QK_VEC_SIZE; + + // cache_k, [B, num_head, head_dim / x, max_seq_len, x] + // x == 4/8 for FP32/FP16, 128bit, 16Byte + constexpr int QK_ELTS_IN_16B = 16 / sizeof(T); + constexpr int QK_VECS_IN_16B = 16 / sizeof(Qk_vec); + + const T *q_base = params.qkv; + const T *k_base = params.qkv + params.num_head * Dh; + const T *q_bias_base = params.qkv_bias; + const T *k_bias_base = params.qkv_bias + params.num_head * Dh; + + if (tid < QK_VECS_PER_WARP) { + int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + int qk_bias_offset = hi * Dh + tid * QK_VEC_SIZE; + + Qk_vec q = *reinterpret_cast(&q_base[qk_offset]); + Qk_vec k = *reinterpret_cast(&k_base[qk_offset]); + + Qk_vec q_bias = + *reinterpret_cast(&q_bias_base[qk_bias_offset]); + Qk_vec k_bias = + *reinterpret_cast(&k_bias_base[qk_bias_offset]); + + q = add(q, q_bias); + // TODO(wangxi): See this https://github.com/microsoft/unilm/issues/510 + // we may not require k_bias. 
+ k = add(k, k_bias); + + *reinterpret_cast(&q_smem[tid * QK_VEC_SIZE]) = q; + + int co = tid / QK_VECS_IN_16B; + int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; + int offset = bhi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + params.timestep * QK_ELTS_IN_16B + ci; + *reinterpret_cast(¶ms.cache_kv[offset]) = k; + + float qk = dot(q, k); +#pragma unroll + for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask); + } + + qk *= params.inv_sqrt_dh; + if (tid == 0) { + // NOTE(wangxi): mask must be 0.0 + // T mask = params.attn_mask[ + // bi * (params.timestep + 1) + params.timestep]; + // qk += static_cast(mask); + qk_max = qk; + qk_smem[params.timestep] = qk; + } + } + __syncthreads(); + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + if (bi == 0 && hi == 0 && tid == 0) { + printf("=======q_out=======\n"); + for (int i = 0; i < Dh; ++i) printf("%f ", static_cast(q_smem[i])); + printf("\n"); + } + __syncthreads(); +#endif + + using K_vec = typename K_vec_::Type; + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(T); + static_assert(Dh % K_VEC_SIZE == 0, ""); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + + int ko = tid / THREADS_PER_KEY; + int ki = (tid % THREADS_PER_KEY) * K_VEC_SIZE; + + K_vec q[K_VECS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < K_VECS_PER_THREAD; ++i) { + q[i] = *reinterpret_cast( + &q_smem[ki + i * THREADS_PER_KEY * K_VEC_SIZE]); + } + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + T *k_cache = ¶ms.cache_kv[bhi * params.max_seq_length * Dh + ki]; + int ti_end = div_up(params.timestep, K_PER_WARP) * K_PER_WARP; + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_seq_length + ti; + if (ti < params.timestep) { + k[ii] = *reinterpret_cast(&k_cache[jj * QK_ELTS_IN_16B]); + } + } + + float qk = Qk_dot::dot(q, k) * params.inv_sqrt_dh; + + // bool is_mask = false; + if (ti < params.timestep && tid % THREADS_PER_KEY == 0) { + // qk_max = is_mask ? qk_max : fmaxf(qk_max, qk); + T mask = params.attn_mask[bi * (params.timestep + 1) + ti]; + qk += static_cast(mask); + qk_max = fmaxf(qk_max, qk); + + qk_smem[ti] = qk; + } + } + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + const int warp = tid / WARP_SIZE; + const int lane = tid % WARP_SIZE; + + if (lane == 0) { + red_smem[warp] = qk_max; + } + + __syncthreads(); + + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + if (bi == 0 && hi == 0 && tid == 0) { + printf("=======qk_out=======\n"); + for (int i = 0; i <= params.timestep; ++i) printf("%f ", qk_smem[i]); + printf("qk_max=%f\n", qk_max); + } + __syncthreads(); +#endif + + float sum = 0.f; + for (int ti = tid; ti <= params.timestep; ti += THREADS_PER_BLOCK) { + // bool is_mask = false; + // float logit = is_mask ? 
0.f : __expf(qk_smem[ti] - qk_max); + float logit = __expf(qk_smem[ti] - qk_max); + sum += logit; + qk_smem[ti] = logit; + } + + sum = block_sum(&red_smem[WARPS_PER_BLOCK], sum); + + // FIXME(wangxi): need add 1.e-6f? + float inv_sum = __fdividef(1.f, sum + 1.e-6f); + for (int ti = tid; ti <= params.timestep; ti += THREADS_PER_BLOCK) { + convert_from_float(logits_smem[ti], qk_smem[ti] * inv_sum); + } + __syncthreads(); + + constexpr int V_VEC_SIZE = Dh / THREADS_PER_VALUE; + using V_vec = typename V_vec_::Type; + + int vo = tid / THREADS_PER_VALUE; + int vi = (tid % THREADS_PER_VALUE) * V_VEC_SIZE; + + T *v_cache = ¶ms.cache_kv[params.batch_size * params.num_head * + params.max_seq_length * Dh + + bhi * params.max_seq_length * Dh + vi]; + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + using V_vec_acum = typename V_vec_acum_fp32_::Type; +#else + using V_vec_acum = V_vec; +#endif + + V_vec_acum out; + zero(out); + + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) { + V_vec v = *reinterpret_cast(&v_cache[ti * Dh]); +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + float logit = logits_smem[ti]; + out = fma(logit, cast_to_float(v), out); +#else + T logit = logits_smem[ti]; + // Update the partial sums. + out = fma(logit, v, out); +#endif + } + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + if (bi == 0 && hi == 0 && tid == 0) { + printf("======logits_out=====\n"); + for (int i = 0; i <= params.timestep; ++i) printf("%f ", logits_smem[i]); + printf("\n"); + } + __syncthreads(); +#endif + + if (vo == (params.timestep % V_PER_ITER)) { + V_vec v = *reinterpret_cast( + ¶ms.qkv[2 * params.num_head * Dh + qkv_base_offset + vi]); + V_vec v_bias = *reinterpret_cast( + ¶ms.qkv_bias[2 * params.num_head * Dh + hi * Dh + vi]); + v = add(v, v_bias); + *reinterpret_cast(&v_cache[params.timestep * Dh]) = v; + +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + out = fma(logits_smem[params.timestep], cast_to_float(v), out); +#else + out = fma(logits_smem[params.timestep], v, out); +#endif + } + + __syncthreads(); + +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; active_groups /= 2) { + int midpoint = active_groups / 2; + + if (vo >= midpoint && vo < active_groups) { +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + convert_from_float( + *reinterpret_cast(&out_smem[(vo - midpoint) * Dh + vi]), + out); +#else + *reinterpret_cast(&out_smem[(vo - midpoint) * Dh + vi]) = out; +#endif + } + __syncthreads(); + if (vo < midpoint) { + out = add(*reinterpret_cast(&out_smem[vo * Dh + vi]), out); + } + __syncthreads(); + } + + if (vo == 0) { +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + convert_from_float(*reinterpret_cast(¶ms.out[bhi * Dh + vi]), + out); +#else + *reinterpret_cast(¶ms.out[bhi * Dh + vi]) = out; +#endif + } + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + __syncthreads(); + if (bi == 0 && hi == 0 && tid == 0) { + printf("======fmha_out=====\n"); + for (int i = 0; i < Dh; ++i) + printf("%f ", static_cast(params.out[i])); + printf("\n"); + } +#endif +#else + assert(false); +#endif +} + +template +inline size_t smem_size_in_bytes( + const Masked_multihead_attention_params ¶ms, int dim_head, + int threads_per_value, int threads_per_block) { + size_t qk_sz = div_up(params.timestep + 1, 4) * 16; + size_t logits_sz = 0; + +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS + if (sizeof(T) != 4) { + logits_sz = div_up(params.max_seq_length, 4) * 4 * sizeof(T); + } +#endif + size_t softmax_sz = qk_sz + logits_sz; + + int rows_per_red = threads_per_block / threads_per_value; + 
size_t red_sz = rows_per_red * dim_head * sizeof(T) / 2; + + return max(softmax_sz, red_sz); +} + +#define MMHA_LAUNCH_KERNEL(T, Dh, THDS_PER_KEY, THDS_PER_VALUE, \ + THDS_PER_BLOCK, stream) \ + size_t smem_sz = \ + smem_size_in_bytes(params, Dh, THDS_PER_VALUE, THDS_PER_BLOCK); \ + dim3 grid(params.num_head, params.batch_size); \ + masked_multihead_attention_kernel< \ + T, Dh, THDS_PER_KEY, THDS_PER_VALUE, \ + THDS_PER_BLOCK><<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params) + +template +void fmha_launch_kernel(const Masked_multihead_attention_params &params, + const cudaStream_t &stream) { + constexpr int THREADS_PER_VALUE = Dh * sizeof(T) / 16; + if (params.timestep < 32) { + MMHA_LAUNCH_KERNEL(T, Dh, 4, THREADS_PER_VALUE, 64, stream); + } else if (params.timestep < 2048) { + MMHA_LAUNCH_KERNEL(T, Dh, 2, THREADS_PER_VALUE, 128, stream); + } else { + MMHA_LAUNCH_KERNEL(T, Dh, 1, THREADS_PER_VALUE, 256, stream); + } +} + +template +void fmha(const platform::CUDADeviceContext &dev_ctx, const Tensor &qkv_tensor, + const Tensor &qkv_bias_tensor, const Tensor &src_mask_tensor, + Tensor *cache_kv_tensor, Tensor *out_tensor, int batch_size, + int max_seq_length, int num_head, int dim_head, int timestep, + float inv_sqrt_dh) { + Masked_multihead_attention_params params; + params.out = out_tensor->data(); + params.qkv = qkv_tensor.data(); + params.qkv_bias = qkv_bias_tensor.data(); + params.attn_mask = src_mask_tensor.data(); + params.cache_kv = cache_kv_tensor->data(); + + params.batch_size = batch_size; + params.num_head = num_head; + params.timestep = timestep; + params.max_seq_length = max_seq_length; + params.inv_sqrt_dh = inv_sqrt_dh; + + switch (dim_head) { + case 32: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 64: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 128: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "dim_head = %d is unsupported; only " + "dim_head = 32, 64 or 128 is supported for now.", + dim_head)); + } +} + +// NOTE: SIMD width is 16 bytes (128 bits), i.e. 4 floats or 8 float16 values +constexpr int VEC_16B = 16; + +template +__global__ void write_cache_k_kernel(T *cache_k, const T *k, const int num_head, + const int dim_head, const int seq_len, + const int max_seq_len) { + const int bi = blockIdx.y; + const int hi = blockIdx.z; + constexpr int X_ELEMS = VEC_16B / sizeof(T); + + // [bsz, num_head, seq_len, dim_head/x, x] + auto k_src = reinterpret_cast( + k + bi * num_head * seq_len * dim_head + hi * seq_len * dim_head); + // [bsz, num_head, dim_head/x, max_seq_len, x] + auto k_dst = reinterpret_cast( + cache_k + bi * num_head * max_seq_len * dim_head + + hi * max_seq_len * dim_head); + + const int out_idx = blockIdx.x * blockDim.x + threadIdx.x; + // vec size + int dim_head_div_x = dim_head / X_ELEMS; + + // FIXME(wangxi): is num_head not needed?
+ // if (out_idx >= num_head * dim_head_div_x * max_seq_len) return; + if (out_idx >= dim_head_div_x * max_seq_len) return; + + int idx = out_idx; + const int k_seq_len_id = idx % max_seq_len; + // idx = (idx - k_seq_len_id) / max_seq_len; + idx = idx / max_seq_len; + const int k_vec_id = idx % dim_head_div_x; + + if (k_seq_len_id < seq_len) { + k_dst[out_idx] = k_src[k_seq_len_id * dim_head_div_x + k_vec_id]; + } +} + +template +__global__ void write_cache_v_kernel(T *cache_v, const T *v, const int num_head, + const int dim_head, const int seq_len, + const int max_seq_len) { + const int bi = blockIdx.y; + const int hi = blockIdx.z; + + // [bsz, num_head, seq_len, dim_head/x, x] + auto v_src = reinterpret_cast( + v + bi * num_head * seq_len * dim_head + hi * seq_len * dim_head); + // [bsz, num_head, max_seq_len, dim_head/x, x] + auto v_dst = reinterpret_cast( + cache_v + bi * num_head * max_seq_len * dim_head + + hi * max_seq_len * dim_head); + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = VEC_16B / sizeof(T); + const int dim_head_div_x = dim_head / X_ELEMS; + + if (idx >= dim_head_div_x * seq_len) return; + + v_dst[idx] = v_src[idx]; +} + +template +void write_cache_kv(const platform::CUDADeviceContext &dev_ctx, T *cache_k, + T *cache_v, const T *k, const T *v, const int bsz, + const int num_head, const int seq_len, + const int max_seq_len, const int dim_head) { + constexpr int block_sz = 128; + constexpr int x = VEC_16B / sizeof(T); + + assert(dim_head % x == 0); + PADDLE_ENFORCE_EQ( + dim_head % x, 0, + platform::errors::PreconditionNotMet( + "dim_head=%d must be divisible by vec_size=%d", dim_head, x)); + + int max_size = max_seq_len * dim_head / x; + int size = seq_len * dim_head / x; + dim3 grid(div_up(max_size, block_sz), bsz, num_head); + dim3 grid_v(div_up(size, block_sz), bsz, num_head); + + // transpose [bsz, num_head, seq_len, dim_head/x, x]-> + // [bsz, num_head, dim_head/x, max_seq_len, x] + write_cache_k_kernel<<>>( + cache_k, k, num_head, dim_head, seq_len, max_seq_len); + + // copy [bsz, num_head, seq_len, dim_head/x, x]-> + // [bsz, num_head, max_seq_len, dim_head/x, x] + write_cache_v_kernel<<>>( + cache_v, v, num_head, dim_head, seq_len, max_seq_len); +} + +} // namespace + +template +class FusedMultiTransformerOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto place = ctx.GetPlace(); + auto &dev_ctx = ctx.cuda_device_context(); + + auto *time_step = ctx.Input("TimeStep"); + // 0. input + auto *input_x = ctx.Input("X"); + const auto input_x_dims = input_x->dims(); + int bsz = input_x_dims[0]; + int seq_len = input_x_dims[1]; + int dim_embed = input_x_dims[2]; + int bsz_seq = bsz * seq_len; + + // 1. layer norm + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + const float epsilon = ctx.Attr("epsilon"); + auto ln_scales = ctx.MultiInput("LnScale"); + auto ln_biases = ctx.MultiInput("LnBias"); + + auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); + Tensor ln_mean, ln_var; + auto *ln_mean_data = ln_mean.mutable_data({bsz_seq}, place); + auto *ln_var_data = ln_var.mutable_data({bsz_seq}, place); + + // 2. 
qkv + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto qkv_weights = ctx.MultiInput("QKVW"); + auto qkv_biases = ctx.MultiInput("QKVBias"); + const auto qkv_w_dims = qkv_weights[0]->dims(); + int num_head = qkv_w_dims[1]; + int dim_head = qkv_w_dims[2]; + int hidden_size = num_head * dim_head; + int output_size = 3 * hidden_size; + int input_size = dim_embed; + + bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; + // (transA, transB, compute_bias) = (false, true, false) + auto qkv_compute = AttnMatMul(dev_ctx, false, true, bsz_seq, output_size, + input_size, compute_bias); + Tensor qkv_out; + auto *qkv_out_data = + qkv_out.mutable_data({bsz, seq_len, 3, num_head, dim_head}, place); + + // 3. fmha + AttnDropoutParam attn_param(true, "upscale_in_train", 0.0, true, true, 0, + nullptr); + auto fmha_compute = + FMHARef(dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); + auto *src_mask = ctx.Input("SrcMask"); + auto cache_kvs = ctx.MultiInput("CacheKV"); + auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); + // auto *time_step = ctx.Input("TimeStep"); + + auto out_seq_len = seq_len; + if (time_step) { + PADDLE_ENFORCE_EQ(time_step->place(), platform::CPUPlace(), + platform::errors::PreconditionNotMet( + "The place of input(TimeStep) must be CPUPlace.")); + // cache_seq_len + int time_step_value = time_step->data()[0]; + PADDLE_ENFORCE_GT(time_step_value, 0, + platform::errors::PreconditionNotMet( + "The value of time_step must > 0, but now is %d", + time_step_value)); + PADDLE_ENFORCE_EQ( + seq_len, 1, + platform::errors::PreconditionNotMet( + "In decode stage, the seq_len of input must be 1, but now is %d", + seq_len)); + out_seq_len += time_step_value; + } + + Tensor transpose_out_2, qk_out; + auto *transpose_out_2_data = transpose_out_2.mutable_data( + {3, bsz, num_head, seq_len, dim_head}, place); + auto *qk_out_data = + qk_out.mutable_data({bsz, num_head, seq_len, out_seq_len}, place); + + Tensor src_mask_out, softmax_out; + Tensor attn_dropout_mask_out, attn_dropout_out; + Tensor qktv_out, fmha_out; + auto *src_mask_out_data = src_mask_out.mutable_data( + {bsz, num_head, seq_len, out_seq_len}, place); + auto *softmax_out_data = softmax_out.mutable_data( + {bsz, num_head, seq_len, out_seq_len}, place); + + auto *attn_dropout_mask_out_data = attn_dropout_mask_out.mutable_data( + {bsz, num_head, seq_len, out_seq_len}, place); + auto *attn_dropout_data_data = attn_dropout_out.mutable_data( + {bsz, num_head, seq_len, out_seq_len}, place); + + auto *qktv_out_data = + qktv_out.mutable_data({bsz, num_head, seq_len, dim_head}, place); + auto *fmha_out_data = + fmha_out.mutable_data({bsz, seq_len, num_head, dim_head}, place); + + // 4. out_linear + auto out_linear_weights = ctx.MultiInput("OutLinearW"); + auto out_linear_biases = ctx.MultiInput("OutLinearBias"); + int ring_id = ctx.Attr("ring_id"); + // (transA, transB, compute_bias) = (false, false, false) + auto out_linear_compute = AttnMatMul(dev_ctx, false, false, bsz_seq, + dim_embed, hidden_size, false); + + // 5. 
ln(residual + bias) + DropoutParam dropout_param2(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); + auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); + auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); + Tensor bias_dropout_residual_out, dropout_mask_out; + auto *bias_dropout_residual_out_data = + bias_dropout_residual_out.mutable_data({bsz, seq_len, dim_embed}, + place); + auto *dropout_mask_out_data = dropout_mask_out.mutable_data( + {bsz, seq_len, dim_embed}, place); + + // 6. ffn matmul1 + auto ffn1_weights = ctx.MultiInput("FFN1Weight"); + auto ffn1_biases = ctx.MultiInput("FFN1Bias"); + auto ffn1_weight_dim = ffn1_weights[0]->dims(); + + int dim_ffn = ffn1_weight_dim[1]; + auto ffn1_linear_compute = AttnMatMul(dev_ctx, false, false, bsz_seq, + dim_ffn, dim_embed, false); + Tensor ffn1_out; + auto *ffn1_out_data = ffn1_out.mutable_data({bsz_seq, dim_ffn}, place); + + // 7. ffn act + bias + DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutHelper fused_act_dropout_helper( + dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); + Tensor ffn1_dropout_out, ffn1_dropout_mask; + auto *ffn1_dropout_out_data = + ffn1_dropout_out.mutable_data({bsz_seq, dim_ffn}, place); + auto *ffn1_dropout_mask_data = + ffn1_dropout_mask.mutable_data({bsz_seq, dim_ffn}, place); + + // 8. ffn2 matmul + auto ffn2_weights = ctx.MultiInput("FFN2Weight"); + auto ffn2_biases = ctx.MultiInput("FFN2Bias"); + auto ffn2_linear_compute = AttnMatMul(dev_ctx, false, false, bsz_seq, + dim_embed, dim_ffn, false); + + // 9. ffn2 residual bias + DropoutParam ffn2_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutLayerNormHelper ffn2_fused_dropout_helper( + dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); + + // calc + auto *out = ctx.Output("Out"); + auto *from_data = out->mutable_data(place); + Tensor *from_tensor = out; + Tensor tmp_out; + auto *tmp_out_data = + tmp_out.mutable_data({bsz, seq_len, dim_embed}, place); + + auto *x_data = input_x->data(); + Tensor *buf0 = nullptr; + Tensor *buf1 = nullptr; + + // step0: x --> buf1 + // step1: buf1 --> buf0 + // step2: buf0 --> buf1 + int layers = qkv_weights.size(); + if (layers & 1) { + // odd, set buf1 as out + buf0 = &tmp_out; + buf1 = out; + } else { + // even, set buf0 as out + buf0 = out; + buf1 = &tmp_out; + } + + for (int i = 0; i < layers; ++i) { + // step1. layer_norm + if (i == 0 && pre_layer_norm) { + auto *ln_scale_data = ln_scales[i]->data(); + auto *ln_bias_data = ln_biases[i]->data(); + // TODO(wangxi): can remove mean var in inference + ln_compute.ComputeForward(x_data, ln_scale_data, ln_bias_data, + buf1->data(), ln_mean_data, ln_var_data); + } else if (!pre_layer_norm) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented post_layer_norm for now.")); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step1"; +#endif + + // step2. qkv + const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + // NOTE: in decoder stage, bias is fused in fmha + const Tensor *bias = time_step ? nullptr : qkv_bias; + qkv_compute.ComputeForward(qkv_weights[i], buf1, bias, &qkv_out, + &qkv_out); +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step2"; +#endif + + // step3. fmha + const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + Tensor *cache_kv_out = cache_kv ? 
cache_kv_outs[i] : nullptr; + + if (time_step) { // generation decoder stage + // [2, batch_size, num_head, max_seq_len, head_size] + int max_seq_len = cache_kv->dims()[3]; + fmha(dev_ctx, qkv_out, *qkv_bias, *src_mask, cache_kv_out, &fmha_out, + bsz, max_seq_len, num_head, dim_head, time_step->data()[0], + 1. / sqrt(dim_head)); + } else if (cache_kv_out) { // generation context stage + // TODO(wangxi): can remove dropout in inference + fmha_compute.ComputeForward( + qkv_out, nullptr, src_mask, &transpose_out_2, nullptr, &qk_out, + &src_mask_out, &softmax_out, &attn_dropout_mask_out, + &attn_dropout_out, &qktv_out, &fmha_out); + // [3, bsz, num_head, seq_len, head_dim] + T *qkv_data = transpose_out_2_data; + int64_t q_size = bsz * seq_len * num_head * dim_head; + int64_t k_size = q_size; + const T *q_ptr = qkv_data; + const T *k_ptr = q_ptr + q_size; + const T *v_ptr = k_ptr + k_size; + + // [2, bsz, num_head, max_seq_len, head_dim] + int max_seq_len = cache_kv_out->dims()[3]; + T *cache_kv_data = cache_kv_out->data(); + int64_t cache_k_size = bsz * num_head * max_seq_len * dim_head; + + T *cache_k_ptr = cache_kv_data; + T *cache_v_ptr = cache_kv_data + cache_k_size; + + write_cache_kv(dev_ctx, cache_k_ptr, cache_v_ptr, k_ptr, v_ptr, bsz, + num_head, seq_len, max_seq_len, dim_head); + } else { // not generation + // TODO(wangxi): can remove dropout in inference + fmha_compute.ComputeForward( + qkv_out, cache_kv, src_mask, &transpose_out_2, cache_kv_out, + &qk_out, &src_mask_out, &softmax_out, &attn_dropout_mask_out, + &attn_dropout_out, &qktv_out, &fmha_out); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step3"; +#endif + + // step4. out_linear + out_linear_compute.ComputeForward(out_linear_weights[i], &fmha_out, + nullptr, buf1, nullptr); + AllReduce(*buf1, ring_id, dev_ctx); +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step4"; +#endif + + // step5. ln(residual + dropout(input + bias)) + if (pre_layer_norm) { + auto *ln_scale_data = ffn_ln_scales[i]->data(); + auto *ln_bias_data = ffn_ln_biases[i]->data(); + auto *out_linear_bias_data = out_linear_biases[i]->data(); + + // inplace + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + dev_ctx, buf1->data(), x_data, out_linear_bias_data, + ln_scale_data, ln_bias_data, bias_dropout_residual_out_data, + dropout_mask_out_data, buf1->data(), ln_mean_data, ln_var_data); + } else { + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step5"; +#endif + + // step6. ffn matmul1 + ffn1_linear_compute.ComputeForward(ffn1_weights[i], buf1, nullptr, + &ffn1_out, nullptr); +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step6"; +#endif + + // step7. act bias + // TODO(wangxi): remove dropout mask in inference + fused_act_dropout_helper.DropoutActBias( + dev_ctx, ffn1_out_data, ffn1_biases[i]->data(), "gelu", + ffn1_dropout_out_data, ffn1_dropout_mask_data); +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step7"; +#endif + + // step8. ffn matmul2 + ffn2_linear_compute.ComputeForward(ffn2_weights[i], &ffn1_dropout_out, + nullptr, buf1, nullptr); +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step8.0"; +#endif + + AllReduce(*buf1, ring_id, dev_ctx); +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step8.1"; +#endif + + // step9. 
residual bias + if (pre_layer_norm) { + // TODO(wangxi): remove dropout mask in inference + if (i < layers - 1) { + auto *ln_scale_data = ln_scales[i + 1]->data(); + auto *ln_bias_data = ln_biases[i + 1]->data(); + ffn2_fused_dropout_helper.LayernormResidualDropoutBias( + dev_ctx, buf1->data(), bias_dropout_residual_out_data, + ffn2_biases[i]->data(), ln_scale_data, ln_bias_data, + buf1->data(), dropout_mask_out_data, buf0->data(), + ln_mean_data, ln_var_data); + } else { + ffn2_fused_dropout_helper.ResidualDropoutBias( + dev_ctx, buf1->data(), bias_dropout_residual_out_data, + ffn2_biases[i]->data(), buf1->data(), + dropout_mask_out_data); + } + } else { + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step9"; +#endif + x_data = buf1->data(); + std::swap(buf0, buf1); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fused_multi_transformer, + ops::FusedMultiTransformerOpKernel, + ops::FusedMultiTransformerOpKernel); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 5dff5e2225f4f..caceac1228e0a 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias { dropout_prob, is_upscale_in_train, is_test); } ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); // add residual for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias { src.data(), residual.data(), bias_ptr, mask.data(), out.data(), *ctx); ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } void FusedBackward() { @@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) { test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-3)); } + +TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) { + // Used to test that `cudaErrorLaunchOutOfResources` will not occur + int rows = 1; + int cols = 12288; + if (std::getenv("_rows") != nullptr) { + rows = atoi(std::getenv("_rows")); + } + if (std::getenv("_cols") != nullptr) { + cols = atoi(std::getenv("_cols")); + } + TestFusedResidualDropoutBias test(rows, cols, 0, 0.0, true, + true); + test.Run(); + test.CheckOut(static_cast(1e-1)); + test.CheckGrad(static_cast(1e-1)); +} diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 8f98a0b9fbee8..5b499b8985f4f 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -29,18 +29,10 @@ class LayerNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< const dnnl::engine engine, platform::Place cpu_place) : platform::MKLDNNHandlerNoCachingT( engine, cpu_place) { - if (!is_test) { - // TODO(grygielski) Delete forcing stats_md after DNNL 1.2 is introduced - auto stats_md = dnnl::memory::desc( - {begin(dims), end(dims) - 1}, platform::MKLDNNGetDataType(), - platform::GetPlainMKLDNNFormat(dims.size() - 1)); - this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, - x->mem_desc(), stats_md, epsilon, - flags); - } else { - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, x->mem_desc(), epsilon, flags); - } + const auto fwd_prop_kind = is_test ? 
dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + this->AcquireForwardPrimitiveDescriptor(fwd_prop_kind, x->mem_desc(), + epsilon, flags); } std::shared_ptr AcquireScaleShiftMemory(const Tensor* scale, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 793aa2644b548..eacab46800580 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -44,6 +44,32 @@ bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type) { return false; } +const std::shared_ptr& GetMLURandomGenerator( + const ExecutionContext& ctx, const int64_t device_id, const int seed) { + static int64_t num_mlu_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque mlu_device_flags; + static std::vector> + mlu_rand_generators; + + std::call_once(num_devices_init_flag, []() { + num_mlu_devices = paddle::platform::GetMLUDeviceCount(); + mlu_device_flags.resize(num_mlu_devices); + mlu_rand_generators.resize(num_mlu_devices); + }); + if (device_id < 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "mlu device id shoule be greater than 0")); + } + + std::call_once(mlu_device_flags[device_id], [&]() { + mlu_rand_generators[device_id].reset( + new MLUCnnlRandomGeneratorDesc(ctx, seed)); + VLOG(4) << "device_id: " << device_id << ", initial seed: " << seed; + }); + return mlu_rand_generators[device_id]; +} + class MLUCnnlTensorDescPool { public: cnnlTensorDescriptor_t Pop() { @@ -266,23 +292,32 @@ MLUCnnlPoolingDesc::~MLUCnnlPoolingDesc() { } } -MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc(const bool is_mlu200, - const int seed) { - if (is_mlu200) { - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_FAST)); - } else { - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32)); - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed)); - } +MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc( + const ExecutionContext& ctx, const int seed) { + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed)); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandGetMTGP32StateSize(mlu_generator, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + mlu_state = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* mlu_state_ptr = mlu_state.mutable_data(ctx.GetPlace()); + + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandMakeMTGP32KernelState( + handle, mlu_state_ptr, nullptr, nullptr, seed)); } const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const { return mlu_generator; } +Tensor& MLUCnnlRandomGeneratorDesc::get_state() { return mlu_state; } + MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() { if (mlu_generator) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandDestroyGenerator(mlu_generator)); @@ -947,6 +982,26 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { workspace_ptr, workspace_size, beta_ptr, output_desc, output)); } +/* static */ void MLUCnnl::MulAx(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t alpha_desc, + const void* alpha, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetAxWorkspaceSize(handle, alpha_desc, 
output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAx_v2(handle, alpha_desc, alpha, output_desc, + output, workspace_ptr, workspace_size)); +} + /* static */ void MLUCnnl::BiasAddGrad( const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, @@ -959,12 +1014,23 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { /* static */ void MLUCnnl::RandomUniform( const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type, - const cnnlRandGenerator_t mlu_generator, const float min, const float max, - void* output) { + const cnnlRandGenerator_t mlu_generator, void* mlu_state, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandGenerateUniform( - handle, mlu_generator, data_type, nullptr, num, min, max, output)); + handle, mlu_generator, data_type, mlu_state, num, 0, 1, output)); +} + +/* static */ void MLUCnnl::FusedDropout( + const ExecutionContext& ctx, const cnnlRandGenerator_t generator, + const cnnlTensorDescriptor_t input_desc, const void* input, const float p, + void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlFusedDropout_v2(handle, generator, input_desc, + input, p, state, mask_desc, + mask, output_desc, output)); } /* static */ void MLUCnnl::TopK( diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 9948c45e24692..572b7aa2bbd01 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -273,14 +273,19 @@ class MLUCnnlPoolingDesc { class MLUCnnlRandomGeneratorDesc { public: - MLUCnnlRandomGeneratorDesc(const bool is_mlu200, const int seed); + MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed); const cnnlRandGenerator_t get() const; + Tensor& get_state(); ~MLUCnnlRandomGeneratorDesc(); private: + Tensor mlu_state; cnnlRandGenerator_t mlu_generator = nullptr; }; +const std::shared_ptr& GetMLURandomGenerator( + const ExecutionContext& ctx, const int64_t device_id, const int seed); + class MLUCnnlReduceDesc { public: MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete; @@ -537,7 +542,13 @@ class MLUCnnl { static void RandomUniform(const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type, const cnnlRandGenerator_t mlu_generator, - const float min, const float max, void* output); + void* mlu_state, void* output); + + static void FusedDropout( + const ExecutionContext& ctx, const cnnlRandGenerator_t generator, + const cnnlTensorDescriptor_t input_desc, const void* input, const float p, + void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, + const cnnlTensorDescriptor_t output_desc, void* output); static void Cumsum(const ExecutionContext& ctx, const int axis, const bool exclusive, const bool reverse, @@ -709,6 +720,10 @@ class MLUCnnl { const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t output_desc, void* output); + static void MulAx(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t alpha_desc, const void* alpha, + const cnnlTensorDescriptor_t output_desc, void* output); + static void OpTensor(const 
ExecutionContext& ctx, const cnnlOpTensorDescriptor_t op_tensor_desc, const cnnlTensorDescriptor_t a_desc, const void* a, diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 8225dc8e07d6a..36e54d741a04b 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -12,168 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -class AdamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { - auto input_data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string &var_name, const framework::Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { - if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || - var_name == "SkipUpdate") { - return expected_kernel_type; - } else { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } - } -}; - -class AdamOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", "(Tensor) Input parameter"); - AddInput("Grad", "(Tensor) Input gradient"); - AddInput("LearningRate", "(Tensor) Learning rate"); - AddInput("Moment1", "(Tensor) Input first moment"); - AddInput("Moment2", "(Tensor) Input second moment"); - AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); - AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); - - AddInput("Beta1Tensor", - "(Tensor, optional) If provided, Adam will use this " - "as beta1, this has a higher priority than attr(beta1), the " - "shape of this tensor MUST BE [1].") - .AsDispensable(); - AddInput("Beta2Tensor", - "(Tensor, optional) If provided, Adam will use this " - "as beta2, this has a higher priority than attr(beta2), the " - "shape of this tensor MUST BE [1].") - .AsDispensable(); - AddInput("EpsilonTensor", - "(Tensor, optional) If provided, Adam will use this " - "as epsilon, this has a higher priority than attr(epsilon), the " - "shape of this tensor MUST BE [1].") - .AsDispensable(); - AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); - AddInput("SkipUpdate", "(Tensor, optional), Skip the update or not.") - .AsDispensable(); - - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("Moment1Out", "(Tensor) Output first moment"); - AddOutput("Moment2Out", "(Tensor) Output second moment"); - AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); - AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator"); - AddOutput("MasterParamOut", - "The updated FP32 master weight for AMP. 
" - "It shared memory with Input(MasterParam).") - .AsDispensable(); - - AddAttr("beta1", - "(float, default 0.9) " - "Exponential decay rate for the " - "first moment estimates.") - .SetDefault(0.9f); - AddAttr("beta2", - "(float, default 0.999) " - "exponential decay rate for the " - "second moment estimates.") - .SetDefault(0.999f); - AddAttr("epsilon", - "(float, default 1.0e-8) " - "Constant for numerical stability") - .SetDefault(1.0e-8f); - AddAttr( - "lazy_mode", - "(bool, default false) " - "only update the parameter that has gradient in sparse update") - .SetDefault(false); - AddAttr("min_row_size_to_use_multithread", - "(int64_t, default 0) " - "when not zero, if param row size is larger then " - "min_row_size_to_use_multithread and " - "inner_op_parallelism is larger then 0, sparse update " - "will run in multithread mode") - .SetDefault(1000); - AddAttr("multi_precision", - "(bool, default false) " - "Whether to use multi-precision during weight updating.") - .SetDefault(false); - // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut - // as dispensable since they are not used when use_global_beta_pow is true. - AddAttr("use_global_beta_pow", - "(bool, default false) " - "Whether to use global beta_pow for whole model instead of " - "creating beta_pow for each parameter.") - .SetDefault(false); - - AddComment(R"DOC( -Adam Optimizer. - -This implements the Adam optimizer from Section 2 of the Adam -paper : https://arxiv.org/abs/1412.6980. -Adam is a first-order gradient-based optimization method based on -adaptive estimates of lower-order moments. - -Adam updates: - -$$ -moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ -moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ -learning\_rate = learning\_rate * - \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ -param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} -$$ - -)DOC"); - } -}; - -class AdamWOp : public AdamOp { - using AdamOp::AdamOp; -}; - -class AdamWOpMaker : public AdamOpMaker { - public: - void Make() { - AdamOpMaker::Make(); - AddAttr("lr_ratio", - "(float, default 1.0) " - "layerwise learning rate decay") - .SetDefault(1.0f); - AddAttr("coeff", - "(float, default 0.01) " - "coeff of the weight decay") - .SetDefault(0.01f); - AddAttr("with_decay", - "(bool, default false) " - "whether to do weight decay") - .SetDefault(false); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(adam, AdamInferMetaFunctor, @@ -185,14 +30,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, AdamInferMetaFunctor); -DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor, - PD_INFER_META(phi::AdamwInferMeta)); -REGISTER_OPERATOR( - adamw, ops::AdamWOp, ops::AdamWOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - AdamwInferMetaFunctor); - REGISTER_OP_VERSION(adam) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h new file mode 100644 index 0000000000000..31feaa8102e7a --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + +class AdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment1", "(Tensor) Input first moment"); + AddInput("Moment2", "(Tensor) Input second moment"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + + AddInput("Beta1Tensor", + "(Tensor, optional) If provided, Adam will use this " + "as beta1, this has a higher priority than attr(beta1), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("Beta2Tensor", + "(Tensor, optional) If provided, Adam will use this " + "as beta2, this has a higher priority than attr(beta2), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("EpsilonTensor", + "(Tensor, optional) If provided, Adam will use this " + "as epsilon, this has a higher priority than attr(epsilon), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); + AddInput("SkipUpdate", "(Tensor, optional), Skip the update or not.") + .AsDispensable(); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("Moment1Out", "(Tensor) Output first moment"); + AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); + AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator"); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. 
" + "It shared memory with Input(MasterParam).") + .AsDispensable(); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddAttr( + "lazy_mode", + "(bool, default false) " + "only update the parameter that has gradient in sparse update") + .SetDefault(false); + AddAttr("min_row_size_to_use_multithread", + "(int64_t, default 0) " + "when not zero, if param row size is larger then " + "min_row_size_to_use_multithread and " + "inner_op_parallelism is larger then 0, sparse update " + "will run in multithread mode") + .SetDefault(1000); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); + + AddComment(R"DOC( +Adam Optimizer. + +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. + +Adam updates: + +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc new file mode 100644 index 0000000000000..e2670625d4e50 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/optimizers/adam_op.h" + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" + +namespace paddle { +namespace operators { + +class AdamWOp : public AdamOp { + using AdamOp::AdamOp; +}; + +class AdamWOpMaker : public AdamOpMaker { + public: + void Make() { + AdamOpMaker::Make(); + AddAttr("lr_ratio", + "(float, default 1.0) " + "layerwise learning rate decay") + .SetDefault(1.0f); + AddAttr("coeff", + "(float, default 0.01) " + "coeff of the weight decay") + .SetDefault(0.01f); + AddAttr("with_decay", + "(bool, default false) " + "whether to do weight decay") + .SetDefault(false); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor, + PD_INFER_META(phi::AdamwInferMeta)); +REGISTER_OPERATOR( + adamw, ops::AdamWOp, ops::AdamWOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamwInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 161483c3420fc..0159e250d317e 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -100,6 +100,10 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddOutput("FP16FusedParamOut", "The updated FP16FusedParam.") .AsDispensable(); + AddOutput("FP32AccFusedGrad", "The accumulated FP32 gradients.") + .AsDispensable(); + AddOutput("FP16AccFusedGrad", "The accumulated FP16 gradients.") + .AsDispensable(); AddOutput("Moment1Out", "The updated Moment1."); AddOutput("Moment2Out", "The updated Moment2."); @@ -110,8 +114,14 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput("FoundInf", "Whether there is NaN/Inf"); + AddOutput("AccStep", "The training steps.").AsDispensable(); + AddOutput("StopUpdate", + "Whether the parameter updating is stopped when the gradient " + "accumulated steps is less than Attr(acc_steps).") + .AsDispensable(); AddOutput("Step", "The global step which excludes the NaN/Inf step."); + AddAttr("acc_steps", "The gradient accumulation steps.").SetDefault(1); AddAttr("beta1", "The initial Beta1Pow value."); AddAttr("beta2", "The initial Beta2Pow value."); AddAttr("epsilon", diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f445a140f27a3..c857c6de4d093 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -1041,6 +1041,58 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, } } +template +static __global__ void ElementwiseAddWithCastCUDAKernel(const T1 *x, + const T2 *y, T3 *z, + int n) { + static_assert(sizeof(T1) <= sizeof(T2), + "sizeof(T1) must be smaller than sizeof(T2)."); + using MT = MasterT; + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = (blockDim.x * gridDim.x) * VecSize; + for (; i + VecSize <= n; i += stride) { + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; + phi::AlignedVector z_vec; + phi::Load(x + i, &x_vec); + phi::Load(y + i, &y_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + auto x_tmp = 
static_cast(x_vec[j]); + auto y_tmp = static_cast(y_vec[j]); + z_vec[j] = static_cast(x_tmp + y_tmp); + } + phi::Store(z_vec, z + i); + } + + for (; i < n; ++i) { + auto x_tmp = static_cast(x[i]); + auto y_tmp = static_cast(y[i]); + z[i] = static_cast(x_tmp + y_tmp); + } +} + +template +static void LaunchElementwiseAddWithCastKernel( + const platform::CUDADeviceContext &dev_ctx, const T1 *x, const T2 *y, T3 *z, + int n, gpuStream_t stream) { + int vec_size = + std::min(std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)), + GetChunkedVecSize(z, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL \ + do { \ + ElementwiseAddWithCastCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y, z, \ + n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL); +#undef PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL +} + template class DistributedFusedLambOpKernel : public framework::OpKernel { @@ -1051,6 +1103,9 @@ class DistributedFusedLambOpKernel auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); + auto *found_inf_t = ctx.Output("FoundInf"); + found_inf_t->Resize({1}); + // Step 1: Get fp16 param and grad tensors int64_t fp16_numel; auto *fp16_param = GetSameInOutTensorPtr( @@ -1095,6 +1150,128 @@ class DistributedFusedLambOpKernel "Too many parameter number. Only <= %d is supported.", std::numeric_limits::max())); + auto acc_steps = ctx.Attr("acc_steps"); + PADDLE_ENFORCE_GE( + acc_steps, 1, + platform::errors::InvalidArgument( + "The gradient accumulation steps should be not less than 1.")); + if (acc_steps > 1) { + auto *step_t = ctx.Output("AccStep"); + PADDLE_ENFORCE_NOT_NULL( + step_t, + platform::errors::InvalidArgument( + "Output(AccStep) cannot be nullptr when Attr(acc_steps) > 1.")); + bool is_initialized = step_t->IsInitialized(); + int64_t *step_ptr; + if (is_initialized) { + step_ptr = step_t->mutable_data(platform::CPUPlace()); + ++(*step_ptr); + } else { + step_t->Resize({1}); + step_ptr = step_t->mutable_data(platform::CPUPlace()); + *step_ptr = 1; + } + int64_t rounded_step = (*step_ptr) % acc_steps; + + float *fp32_acc_grad = nullptr; + if (has_fp32_param) { + auto *fp32_acc_grad_t = + ctx.Output("FP32AccFusedGrad"); + PADDLE_ENFORCE_NOT_NULL( + fp32_acc_grad_t, platform::errors::InvalidArgument( + "Output(FP32AccFusedGrad) cannot be nullptr " + "when Attr(acc_steps) > 1.")); + if (!fp32_acc_grad_t->IsInitialized()) { + fp32_acc_grad_t->Resize({static_cast(fp32_numel)}); + fp32_acc_grad = fp32_acc_grad_t->mutable_data(place); + } else { + fp32_acc_grad = fp32_acc_grad_t->data(); + } + } + + platform::float16 *fp16_acc_grad = nullptr; + float *master_acc_grad = nullptr; + if (has_fp16_param) { + auto *fp16_acc_grad_t = + ctx.Output("FP16AccFusedGrad"); + PADDLE_ENFORCE_NOT_NULL( + fp16_acc_grad_t, platform::errors::InvalidArgument( + "Output(FP16AccFusedGrad) cannot be nullptr " + "when Attr(acc_steps) > 1.")); + if (!fp16_acc_grad_t->IsInitialized()) { + fp16_acc_grad_t->Resize({static_cast(3 * fp16_numel)}); + fp16_acc_grad = + fp16_acc_grad_t->mutable_data(place); + } else { + fp16_acc_grad = fp16_acc_grad_t->data(); + } + master_acc_grad = reinterpret_cast(fp16_acc_grad + fp16_numel); + } + + // Inplace addto + if (has_fp32_param) { + if (rounded_step == 1) { + memory::Copy(place, fp32_acc_grad, place, fp32_grad, + fp32_numel * sizeof(float), stream); + } else { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp32_grad, 
fp32_acc_grad, + fp32_acc_grad, fp32_numel, stream); + } + } + + if (has_fp16_param) { + if (acc_steps == 2) { + if (rounded_step == 0) { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad, + fp16_grad, fp16_acc_grad, + fp16_numel, stream); + } else { + memory::Copy(place, fp16_acc_grad, place, fp16_grad, + fp16_numel * sizeof(platform::float16), stream); + } + } else { // acc_steps >= 3 + if (rounded_step == 0) { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad, + master_acc_grad, fp16_acc_grad, + fp16_numel, stream); + } else if (rounded_step == 1) { + memory::Copy(place, fp16_acc_grad, place, fp16_grad, + fp16_numel * sizeof(platform::float16), stream); + } else if (rounded_step == 2) { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad, + fp16_acc_grad, master_acc_grad, + fp16_numel, stream); + } else { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad, + master_acc_grad, master_acc_grad, + fp16_numel, stream); + } + } + } + + auto *stop_update_t = ctx.Output("StopUpdate"); + stop_update_t->Resize({1}); + auto *stop_update = + stop_update_t->mutable_data(platform::CPUPlace()); + + auto *found_inf_cpu = + found_inf_t->mutable_data(platform::CPUPlace()); + + if (rounded_step != 0) { + *stop_update = true; + auto *found_inf_cpu = + found_inf_t->mutable_data(platform::CPUPlace()); + *found_inf_cpu = false; + return; + } else { + // swap pointer + fp32_grad = fp32_acc_grad; + fp16_grad = fp16_acc_grad; + *stop_update = false; + found_inf_t->clear(); + } + } + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; @@ -1122,7 +1299,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // GlobalScale, FoundInf + // GlobalScale const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1157,8 +1334,6 @@ class DistributedFusedLambOpKernel auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - auto *found_inf_t = ctx.Output("FoundInf"); - found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, diff --git a/paddle/fluid/operators/pixel_unshuffle_op.cc b/paddle/fluid/operators/pixel_unshuffle_op.cc new file mode 100644 index 0000000000000..8d16e02c04c83 --- /dev/null +++ b/paddle/fluid/operators/pixel_unshuffle_op.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class PixelUnshuffleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +class PixelUnshuffleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of PixelUnshuffleOp, the layout is " + "[N, C, H, W] or [N, H, W, C]."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "PixelUnshuffleOp. The layout is [N, C*factor^2, H/factor, " + "W/factor] or [N, H/factor, W/factor, C*factor^2]."); + AddAttr("downscale_factor", + "the factor to decrease spatial resolution by.") + .SetDefault(1); + AddAttr( + "data_format", + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\", Specify the data format of the input data.") + .SetDefault("NCHW"); + + AddComment(R"DOC( + Pixel Unshuffle operator + This operator rearranges elements in a tensor of shape :math:`(*, C, H, W)` + to a tensor of shape :math:`(*, C\times r^2, H / r, W / r)`. + + This operation is the reversion of PixelShuffle operation. + + Please refer to the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient + Sub-Pixel Convolutional Neural Network `_ + by Shi et. al (2016) for more details. + + )DOC"); + } +}; + +template +class PixelUnshuffleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("pixel_unshuffle_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +class PixelUnshuffleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle, PixelUnshuffleInferShapeFunctor, + PD_INFER_META(phi::PixelUnshuffleInferMeta)); + +REGISTER_OPERATOR(pixel_unshuffle, ops::PixelUnshuffleOp, + ops::PixelUnshuffleOpMaker, + ops::PixelUnshuffleGradOpMaker, + ops::PixelUnshuffleGradOpMaker, + PixelUnshuffleInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle_grad, + PixelUnshuffleGradInferShapeFunctor, + PD_INFER_META(phi::PixelUnshuffleGradInferMeta)); + +REGISTER_OPERATOR(pixel_unshuffle_grad, ops::PixelUnshuffleGradOp, + PixelUnshuffleGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc old mode 100644 new mode 100755 index 2340f443c49fb..4ffca35ea5694 --- a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc +++ b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc @@ -15,6 +15,9 @@ limitations under the License. 
*/ #if defined PADDLE_WITH_PSCORE #include +#include +#include +#include #include #include #include @@ -69,44 +72,6 @@ void StartSwitchServer( std::vector peer_endpoints) { switch_server_ptr->SetPeerEndPoints(peer_endpoints); switch_server_ptr->SetEndPoint(endpoints[0]); - /* - std::shared_ptr b_req_handler; - b_req_handler.reset(new distributed::SendAndRecvVariableHandler()); - switch_server_ptr->SetServiceHandler(b_req_handler); - - switch_server_ptr->SetLocalScope(); - - switch_server_ptr->RegisterServiceHandler( - std::to_string(distributed::PS_SAVE_WITH_SCOPE), - [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { - return b_req_handler->SaveInSwitchWithScope(request, response, cntl); - }); - - switch_server_ptr->RegisterServiceHandler(std::to_string(distributed::PS_SAVE_WITH_SHARD), - [&](const MultiVarMsg* request, MultiVarMsg* - response, - brpc::Controller* cntl) -> int { - return b_req_handler->SaveInSwitchWithShard( - request, response, cntl); - }); - - switch_server_ptr->RegisterServiceHandler(std::to_string(distributed::PS_QUERY_WITH_SCOPE), - [&](const MultiVarMsg* request, MultiVarMsg* - response, - brpc::Controller* cntl) -> int { - return b_req_handler->QueryInSwitchWithScope( - request, response, cntl); - }); - - switch_server_ptr->RegisterServiceHandler(std::to_string(distributed::PS_QUERY_WITH_SHARD), - [&](const MultiVarMsg* request, MultiVarMsg* - response, - brpc::Controller* cntl) -> int { - return b_req_handler->QueryInSwitchWithShard( - request, response, cntl); - }); - */ switch_server_ptr->StartHeterService(false); } @@ -119,6 +84,130 @@ void StartSwitchInterServer( switch_server_ptr->StartHeterInterService(false); } +void TestShardSendRecv( + std::shared_ptr heter_client_ptr_) { + auto send_async = [&]() -> void { + std::vector vars_len{2 * sizeof(float), + 4 * sizeof(float)}; // 字节数 + std::vector values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + int64_t data_size = 6 * sizeof(float); + std::vector send_var_names{"w", "x"}; + int group_id = 0; + int ret = heter_client_ptr_->Send(group_id, send_var_names, vars_len, + values.data(), data_size); + if (!ret) { + LOG(INFO) << ">>>> TestShardSendRecv: worker send success"; + } + }; + std::thread t(send_async); + + int group_id = 0; + std::vector recv_var_names{"w", "x"}; + int data_size = 6 * sizeof(float); + float* value_ptr = new float[6]; + int ret = + heter_client_ptr_->Recv(group_id, recv_var_names, value_ptr, data_size); + if (!ret) { + VLOG(4) << "queried data is: "; + for (int i = 0; i < 6; i++) { + VLOG(4) << value_ptr[i] << " "; + } + delete[] value_ptr; + LOG(INFO) << "<<<< TestShardSendRecv: worker recv success"; + } + + t.join(); +} + +void PressTestSendRecv( + std::shared_ptr heter_client_ptr_) { + // long l = 0, m = 0; + // https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/send_20_34 + std::ifstream file("/send_20_34", std::ios::in | std::ios::binary); + // l = file.tellg(); + // file.seekg(0, std::ios::end); + // m = file.tellg(); + // file.close(); + // VLOG(0) << "size of file " << "20_34" << " is " << (m - l) << " bytes.\n"; + int64_t vars_len = 2359296 * sizeof(float); + int64_t data_size = vars_len; + VLOG(0) << "float num: " << data_size; + float* data_ptr = new float[data_size]; + file.read((char*)data_ptr, 9437184); + VLOG(0) << "send data is: " << data_ptr[0] << ", " << data_ptr[1]; + std::vector var_names{"34"}; + int loopCnt = 10000; + auto send_async = [&]() -> void { + int i = 0; + while (i++ < loopCnt) { + heter_client_ptr_->Send(20, var_names, 
{vars_len}, data_ptr, data_size); + } + }; + std::thread t(send_async); + float* values = new float[2359296]; + int i = 0; + while (i++ < loopCnt) { + int ret = heter_client_ptr_->Recv(20, var_names, values, data_size); + if (!ret) { + VLOG(0) << "diff: " << abs(values[0] - 0.159544) << ", " + << abs(values[1] + 2.3484); + VLOG(0) << "loop id: " << i; + for (int j = 0; j < 2359296; j++) { + if (abs(values[j] - data_ptr[j]) > 4e-6) { + VLOG(0) << "error data idx: " << j; + VLOG(0) << "diff detail: " << values[j] << ", " << data_ptr[j]; + LOG(INFO) << ">>>> worker recv ERROR"; + break; + } + } + for (uint32_t i = 0; i < 2359296; i++) { + values[i] = -1; // reset + } + } + } + + std::ofstream recv("/recv_20_34", std::ios::out | std::ios::binary); + recv.write((char*)values, data_size); + recv.close(); + delete[] values; + t.join(); +} + +void TestScopeSendRecv( + std::shared_ptr heter_client_ptr_) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + framework::Executor exe(place); + std::shared_ptr send_scope_ptr = + std::make_shared(); + int64_t rows_numel = 10; + InitTensorsOnClient(send_scope_ptr.get(), &place, rows_numel); + LOG(INFO) << "InitTensorsOnClient done"; + auto send_async = [&]() -> void { + std::string message_name = std::to_string(distributed::PS_SAVE_WITH_SCOPE); + std::vector send_var_names{"w", "x"}; + int ret = heter_client_ptr_->Send(ctx, *send_scope_ptr, message_name, + send_var_names); + if (!ret) { + LOG(ERROR) << ">>>> TestScopeSendRecv: worker send success"; + } + }; + std::thread t(send_async); + + std::string message_name = std::to_string(distributed::PS_QUERY_WITH_SCOPE); + std::vector recv_var_names{"w", "x"}; + std::shared_ptr recv_scope_ptr = + std::make_shared(); + int ret = heter_client_ptr_->Recv(ctx, *recv_scope_ptr, message_name, + recv_var_names); + if (!ret && recv_scope_ptr->FindVar("w") && recv_scope_ptr->FindVar("x")) { + LOG(INFO) << "<<<< TestScopeSendRecv: worker recv success"; + } else { + LOG(INFO) << "<<<< TestScopeSendRecv: worker recv failed"; + } + t.join(); +} + TEST(HETERSENDANDRECV, CPU) { setenv("http_proxy", "", 1); setenv("https_proxy", "", 1); @@ -155,79 +244,19 @@ TEST(HETERSENDANDRECV, CPU) { switch_server_ptr_b->WaitServerReady(); // 获取 client 实例 + // Before running this unit test, please reset recv_switch_channels_ on the HeterClient side. std::shared_ptr heter_client_ptr_ = distributed::HeterClient::GetInstance( {switch_a_endpoint, switch_b_endpoint}, {}, 0); + framework::ProgramDesc program; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); framework::Executor exe(place); - - framework::ProgramDesc program; exe.Prepare(program, 0); // solve undefined symbol: tensor_table.cc - std::shared_ptr send_scope_ptr = - std::make_shared(); - int64_t rows_numel = 10; - InitTensorsOnClient(send_scope_ptr.get(), &place, rows_numel); - LOG(INFO) << "InitTensorsOnClient done"; - - auto send_async = [&]() -> void { - /* - //std::string message_name = - std::to_string(distributed::PS_SAVE_WITH_SCOPE); - std::string message_name = "send and save"; - std::vector send_var_names{"w", "x"}; - int ret = heter_client_ptr_->Send(ctx, *send_scope_ptr, message_name, - send_var_names); - if (!ret) { - LOG(ERROR) << ">>>> worker send success"; - } - */ - ///* - std::vector vars_len{2, 4}; - std::vector values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - int64_t data_size = 6; - std::vector send_var_names{"w", "x"}; - int group_id = 0; - int ret = heter_client_ptr_->Send(group_id, send_var_names, vars_len, - values.data(), data_size); - if (!ret) { - LOG(INFO) << ">>>>
worker send success"; - } - //*/ - }; - std::thread send_thread(send_async); - /* - std::string message_name = std::to_string(distributed::PS_QUERY_WITH_SCOPE); - std::vector recv_var_names{"w", "x"}; - std::shared_ptr recv_scope_ptr = - std::make_shared(); - int ret = heter_client_ptr_->Recv(ctx, *recv_scope_ptr, message_name, - recv_var_names); - if (!ret && recv_scope_ptr->FindVar("w") && recv_scope_ptr->FindVar("x")) { - LOG(INFO) << ">>>> worker recv success"; - } else { - LOG(INFO) << "worker recv failed"; - } - */ - ///* - int group_id = 0; - std::vector recv_var_names{"w", "x"}; - std::vector values; - int data_size = 6; - values.resize(data_size); - int ret = heter_client_ptr_->Recv(group_id, recv_var_names, values.data(), - data_size); - if (!ret) { - VLOG(4) << "queried data is: "; - for (auto f : values) { - VLOG(4) << f << " "; - } - LOG(INFO) << ">>>> worker recv success"; - } - //*/ - send_thread.join(); + // TestScopeSendRecv(heter_client_ptr_); + // TestShardSendRecv(heter_client_ptr_); + PressTestSendRecv(heter_client_ptr_); switch_server_ptr_a->Stop(); LOG(INFO) << "switch server A stopped"; diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu index 16c7a4794bb50..b33859153419c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu index f9f015804e11d..037dab396c757 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ff1ddb4175fef..76641698ead67 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/reduce.h" -#if defined(__HIPCC__) || defined(__NVCC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" #endif @@ -613,7 +613,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
virtual std::string GetOpType() const = 0; }; -#if defined(__HIPCC__) || defined(__NVCC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) template class ReduceOp, template class TransformOp> class ReduceCudaKernel : public framework::OpKernel { @@ -626,9 +626,12 @@ class ReduceCudaKernel : public framework::OpKernel { auto pt_out_dtype = paddle::framework::TransToPhiDataType( static_cast(out_dtype)); std::vector dims = context.Attr>("dim"); - +#ifdef PADDLE_WITH_XPU_KP + auto& dev_ctx = + context.template device_context(); +#else auto& dev_ctx = context.cuda_device_context(); - +#endif if (out_dtype >= 0) { output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); } else { @@ -642,6 +645,7 @@ class ReduceCudaKernel : public framework::OpKernel { } }; +#ifndef PADDLE_WITH_XPU_KP template class TransformOp> class ReduceCudaGradKernel : public framework::OpKernel { public: @@ -686,6 +690,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { } }; #endif +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 2a78774f3706e..6b8e6b8f8054f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -89,6 +89,12 @@ class ReduceSumVarTypeInference : public paddle::framework::VarTypeInference { BOOST_GET_CONST(int, ctx->GetAttr("out_dtype"))); if (data_type >= 0) { ctx->SetOutputDataType("Out", data_type); + } else { + auto x_type = ctx->GetInputDataType("X"); + if (x_type == framework::proto::VarType::BOOL || + x_type == framework::proto::VarType::INT32) { + ctx->SetOutputDataType("Out", framework::proto::VarType::INT64); + } } } }; diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index 84b0f403be038..4af355bfca641 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -23,6 +23,19 @@ namespace operators { class SizeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = framework::proto::VarType::FP32; // dtype is not important + return framework::OpKernelType(dtype, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return expected_kernel_type; + } }; class SizeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -40,6 +53,8 @@ Return the number of elements in the input. } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERER(SizeOpNoNeedBufferVarInferer, "Input"); + } // namespace operators } // namespace paddle @@ -50,4 +65,4 @@ REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - SizeInferShapeFunctor); + SizeInferShapeFunctor, ops::SizeOpNoNeedBufferVarInferer); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 89e3b74bb3aca..eb82389702ca4 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" @@ -49,6 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); +#ifdef PADDLE_WITH_TESTING +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, + "Whether to print the message of gpu memory usage " + "at exit, mainly used for UT and CI."); +#endif + constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; @@ -137,12 +144,31 @@ class RecordedGpuMallocHelper { if (NeedRecord()) { mtx_.reset(new std::mutex()); } + +#ifdef PADDLE_WITH_TESTING + if (FLAGS_enable_gpu_memory_usage_log) { + // A fake UPDATE to trigger the construction of memory stat instances, + // make sure that they are destructed after RecordedGpuMallocHelper. + MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + } +#endif } DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); public: + ~RecordedGpuMallocHelper() { +#ifdef PADDLE_WITH_TESTING + if (FLAGS_enable_gpu_memory_usage_log) { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl; + } +#endif + } + static RecordedGpuMallocHelper *Instance(int dev_id) { + static std::vector> instances_; + std::call_once(once_flag_, [] { int dev_cnt = GetGPUDeviceCount(); instances_.reserve(dev_cnt); @@ -326,14 +352,11 @@ class RecordedGpuMallocHelper { mutable std::unique_ptr mtx_; static std::once_flag once_flag_; - static std::vector> instances_; std::set gpu_ptrs; // just for testing }; // NOLINT std::once_flag RecordedGpuMallocHelper::once_flag_; -std::vector> - RecordedGpuMallocHelper::instances_; gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory) { diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 012294d0fff85..0871624a5d749 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -13,12 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include "paddle/fluid/platform/device/ipu/ipu_utils.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_executor.h" namespace paddle { namespace platform { @@ -40,7 +38,7 @@ IpuBackend::~IpuBackend() { executor_.reset(); } -void IpuBackend::Compile(Graph* graph, +void IpuBackend::Compile(framework::ir::Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; @@ -63,8 +61,8 @@ void IpuBackend::Compile(Graph* graph, VLOG(10) << "leave IpuBackend::Compile"; } -void IpuBackend::Run(const std::vector& inputs, - const std::vector& outputs, +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, const framework::ExecutionContext& ctx) { timer_->Start(); executor_->Run(inputs, outputs, ctx); @@ -82,7 +80,7 @@ void IpuBackend::Reset() { executor_.reset(); } -void IpuBackend::SetScope(const Scope& scope) { +void IpuBackend::SetScope(const framework::Scope& scope) { scope_ = &scope; executor_->SetScope(&scope); } diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 0578d9face675..1e083e7a3518c 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -18,26 +18,25 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" -#include "paddle/fluid/platform/device/ipu/ipu_device.h" -#include "paddle/fluid/platform/device/ipu/ipu_executor.h" #include "paddle/fluid/platform/device/ipu/ipu_strategy.h" -#include "paddle/fluid/platform/device/ipu/ipu_utils.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/timer.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +} // namespace paddle + namespace paddle { namespace platform { namespace ipu { -// IpuBackend is the center of paddle-ipu, its function include: -// 1. Compile paddle model to popart model -// 2. Run popart model, inference or training -// 3. Request and release device -// 4. Other helper function +class IpuStrategy; +class Compiler; +class Executor; + class IpuBackend { public: static IpuBackend *GetInstance(); @@ -46,47 +45,46 @@ class IpuBackend { IpuBackend(); ~IpuBackend(); - // what compile does include(call compiler_): - // 1. map paddle-op -> poart op - // 2. construct popart onnx compute graph - void Compile(Graph *graph, const std::vector &feed_list, + // What compile method does: + // Convert paddle ops to popart ops; + // Construct a popart graph, which is a onnx compute graph; + // Load the graph and weights to ipu. + void Compile(framework::ir::Graph *graph, + const std::vector &feed_list, const std::vector &fetch_list); - // what run does include: - // 1. construct forward onnx graph - // 2. graph-level optimization - // 3. 
autodiff - void Run(const std::vector &inputs, - const std::vector &outputs, + // Run the compiled graph on ipu + void Run(const std::vector &inputs, + const std::vector &outputs, const framework::ExecutionContext &ctx); // Sync weights from IPU while training void WeightsToHost(); - // detach IPU manually + // Detach IPU manually void Detach(); - // reset manually - // call it before destruct works + // Reset manually + // Call it before destruct works void Reset(); - void SetScope(const Scope &scope); - const Scope *GetScope() { return scope_; } + void SetScope(const framework::Scope &scope); + const framework::Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } - // save compiled model to onnx + // Save compiled model to onnx void SaveModelProto(const std::string &path); private: - // not own - const Scope *scope_ = nullptr; + // Not own + const framework::Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; - // own + // Own std::unique_ptr compiler_; std::unique_ptr executor_; - std::unique_ptr timer_; + std::unique_ptr timer_; bool is_compiled_ = false; diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 7ae3b2303decd..f2a37aae369ec 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -20,12 +20,110 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu { +namespace { + +struct CustomOpAttrVisitor : public boost::static_visitor { + CustomOpAttrVisitor(std::map* attr, + const std::string& attr_name) + : attrs_(attr), attr_name_(attr_name) {} + + mutable std::map* attrs_; + std::string attr_name_; + + void operator()(int v) const { attrs_->emplace(attr_name_, v); } + void operator()(float v) const { attrs_->emplace(attr_name_, v); } + void operator()(const std::string& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(bool v) const { attrs_->emplace(attr_name_, v); } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(BlockDesc* desc) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method for `BlockDesc` type when extracting " + "custom operator attributes.")); + } + void operator()(const std::vector& v) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method for `BlockDesc` type when extracting " + "custom operator attributes.")); + } + void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(boost::blank) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method for `boost::blank` type when extracting " + "custom operator attributes.")); + } +}; + +struct ConstantOpAttrVisitor : public boost::static_visitor { 
+ ConstantOpAttrVisitor(framework::LoDTensor* tensor, VarType::Type dtype) + : tensor_(tensor), dtype_(dtype) {} + + framework::LoDTensor* tensor_; + VarType::Type dtype_; + + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } + void operator()(const std::vector& vec) const { + if (dtype_ == VarType::FP16) { + std::vector vec_fp16; + std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16), + [](float f) -> float16 { return float16(f); }); + framework::TensorFromVector(vec_fp16, tensor_); + } else { + framework::TensorFromVector(vec, tensor_); + } + } + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } +#define RAISE_ERROR \ + PADDLE_THROW( \ + platform::errors::InvalidArgument("Constant value must be a vector")) + void operator()(int v) const { RAISE_ERROR; } + void operator()(float v) const { RAISE_ERROR; } + void operator()(const std::string& v) const { RAISE_ERROR; } + void operator()(const std::vector& v) const { RAISE_ERROR; } + void operator()(bool v) const { RAISE_ERROR; } + void operator()(BlockDesc* desc) const { RAISE_ERROR; } + void operator()(const std::vector& v) const { RAISE_ERROR; } + void operator()(int64_t v) const { RAISE_ERROR; } + void operator()(boost::blank) const { RAISE_ERROR; } +#undef RAISE_ERROR +}; + popart::AdamMode AdamModeFromStr(const std::string& str, const bool& use_no_bias_optimizer) { if (str == "adam") { @@ -117,6 +215,34 @@ TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { } } +// Helper for adding namescope info +struct NameScopeHelper { + NameScopeHelper(const OpDesc* op, popart::Builder* builder); + + ~NameScopeHelper() { + if (pushed_) { + builder_->popNameScope(); + } + } + + bool pushed_ = false; + popart::Builder* builder_; +}; + +NameScopeHelper::NameScopeHelper(const OpDesc* op, popart::Builder* builder) + : builder_(builder) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope.empty() || op_namescope == "/") { + return; + } + op_namescope.pop_back(); + op_namescope.erase(op_namescope.begin()); + builder->pushNameScope(op_namescope); + pushed_ = true; +} + +} // namespace + GraphHelper::GraphHelper(const Graph* g) { graph = g; sorted_ops = framework::ir::TopologySortOperations(*g); @@ -181,17 +307,12 @@ void Compiler::RegisterOpFunc() { auto op_type = op_desc->Type(); \ VLOG(10) << "build op:" << op_type << " args " << #Args; \ auto inputs = GetOpInputs(op_desc); \ - auto output_names = GetOpOutputs(op_desc); \ auto debug_context = BuildDebugContext(op_desc); \ auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ - PushNameScope(op_desc); \ + NameScopeHelper ns_helper(op_desc, builder_.get()); \ auto output_ids = OnnxImpl(inputs Args, debug_context); \ - PopNameScope(op_desc); \ - SetIpuIndexStage(output_ids, op_desc); \ - SetAMPAttributes(output_ids, op_desc); \ - SetSerializeAttributes(output_ids, op_desc); \ - InsertTensors(output_names, output_ids); \ + PostLower(output_ids, op_desc); \ }}, // NOLINT #include "paddle/fluid/platform/device/ipu/supported_ops_autogen.h" #include "paddle/fluid/platform/device/ipu/supported_ops_custom.h" @@ -222,7 +343,7 @@ void Compiler::InitInputs(const std::vector& feed_list) { auto* node = 
graph_helper_->vars_name_map[feed_name]; auto* var_desc = node->Var(); VLOG(10) << "feed_name= " << var_desc->Name(); - auto data_type = VarType2PopartType(var_desc->GetDataType()); + auto data_type = VarType2PopartDType(var_desc->GetDataType()); popart::TensorInfo input_info{data_type, var_desc->GetShape()}; VLOG(10) << "popart input_info = " << input_info; popart::TensorId tensor_id = @@ -258,8 +379,9 @@ void Compiler::LowerConstants(const Scope* scope) { auto shape = BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); - auto dtype = PopartType2VarType(OnnxDtype2PopartType(dtype_)); - auto tensor_name = op_desc->Output("__outputs__")[0]; + auto dtype = PopartDType2VarType( + OnnxDType2PopartType(static_cast(dtype_))); + auto tensor_name = GetOpOutputs(op_desc).front(); auto* var = kid_scope.Var(tensor_name); VLOG(10) << "lowering constant: " << tensor_name; auto* tensor = var->GetMutable(); @@ -270,13 +392,12 @@ void Compiler::LowerConstants(const Scope* scope) { tensor->Resize(ddim); auto const_data = std::unique_ptr(); - popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()), + popart::TensorInfo tensor_info(PhiDType2PopartDType(tensor->dtype()), shape); const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); - PushNameScope(op_desc); + NameScopeHelper ns_helper(op_desc, builder_.get()); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); - PopNameScope(op_desc); - SetIpuIndexStage(result, op_desc); + PostLower(result, op_desc); resources_->tensors.emplace(tensor_name, result); } } @@ -285,42 +406,42 @@ void Compiler::LowerConstants(const Scope* scope) { void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "enter Compiler::LowerWeights"; - // at this step, the graph doesn't contains optimizer related states + // At this step, the graph doesn't contains optimizer related states for (auto id : graph_helper_->sorted_vars_id) { auto* node = graph_helper_->nodes_id_map[id]; - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - if (node->Var()->Persistable() && node->inputs.empty()) { - auto var_name = node->Var()->Name(); - if (resources_->tensors.count(var_name) != 0) { - VLOG(10) << "found existed one, skip lowering Weight: " << var_name; - continue; - } - if (var_name.rfind("learning_rate", 0) == 0) { - VLOG(10) << "skip learning_rate_var: " << var_name; - continue; - } - VLOG(10) << "lowering weight: " << var_name; - - auto var = scope->FindVar(var_name); - if (var) { - auto tensor = var->Get(); - auto dtype = PdDataType2PopartType(tensor.dtype()); - auto shape = std::vector(); - for (size_t i = 0; i < tensor.dims().size(); ++i) { - shape.push_back(tensor.dims().at(i)); - } - popart::TensorInfo tensor_info(dtype, shape); - popart::ConstVoidData const_data{tensor.data(), tensor_info}; - if (!node->outputs.empty()) { - auto op_node = node->outputs[0]; - PushNameScope(op_node->Op()); - popart::TensorId result = - builder_->addInitializedInputTensor(const_data, var_name); - PopNameScope(op_node->Op()); - resources_->tensors.emplace(var_name, result); - resources_->weights.push_back(var_name); - } - } + // Weights are var node and Persistable + if (node->IsVar() && !node->IsCtrlVar() && node->Var() && + node->Var()->Persistable()) { + // Weights are Parameter in training mode + if (ipu_strategy_->is_training && !node->Var()->IsParameter()) { + continue; + } + auto var_name = node->Var()->Name(); + // Some op has same input and output tensor, like 
batchnorm + if (resources_->tensors.count(var_name) != 0) { + VLOG(10) << "found existed one, skip lowering Weight: " << var_name; + continue; + } + VLOG(10) << "lowering weight: " << var_name; + auto var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Tensor %s is not found in the scope", + var_name)); + auto tensor = var->Get(); + auto dtype = PhiDType2PopartDType(tensor.dtype()); + auto shape = std::vector(); + for (size_t i = 0; i < tensor.dims().size(); ++i) { + shape.push_back(tensor.dims().at(i)); + } + popart::TensorInfo tensor_info(dtype, shape); + popart::ConstVoidData const_data{tensor.data(), tensor_info}; + if (!node->outputs.empty()) { + auto op_node = node->outputs[0]; + NameScopeHelper ns_helper(op_node->Op(), builder_.get()); + popart::TensorId result = + builder_->addInitializedInputTensor(const_data, var_name); + resources_->tensors.emplace(var_name, result); + resources_->weights.push_back(var_name); } } } @@ -340,12 +461,9 @@ void Compiler::LowerBody() { // pass } else if (op_type == "popart_checkpointoutput") { auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - PushNameScope(op_desc); + NameScopeHelper ns_helper(op_desc, builder_.get()); auto output_ids = builder_->checkpointOutput(inputs); - PopNameScope(op_desc); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); + PostLower(output_ids, op_desc); } else if (op_type == "popart_custom_op") { auto inputs = GetOpInputs(op_desc); auto outputs = GetOpOutputs(op_desc); @@ -359,26 +477,21 @@ void Compiler::LowerBody() { BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); VLOG(10) << "Build graph from custom op: " << __op_type; auto it = custom_ops_.find(__op_type); - PushNameScope(op_desc); + NameScopeHelper ns_helper(op_desc, builder_.get()); auto output_ids = builder_->customOp(it->second.popart_op, it->second.popart_op.version, inputs, outputs.size(), attributes, debug_context); - PopNameScope(op_desc); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); + PostLower(output_ids, op_desc); } else if (op_type == "popart_printtensor") { auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); auto debug_context = BuildDebugContext(op_desc); auto print_gradient = BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); - PushNameScope(op_desc); + NameScopeHelper ns_helper(op_desc, builder_.get()); auto output_ids = builder_->aiGraphcoreOpset1().printtensor( inputs, print_gradient, debug_context, title); - PopNameScope(op_desc); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); + PostLower(output_ids, op_desc); } else { auto itr = name_function_.find(op_type); if (itr != name_function_.end()) { @@ -608,29 +721,13 @@ void Compiler::LowerOptimizer(const Scope* scope) { } } -void Compiler::InsertTensors(const std::vector& output_names, - const std::vector& tensor_ids) { - PADDLE_ENFORCE_EQ(output_names.size(), tensor_ids.size(), - platform::errors::Fatal("InsertTensors size mismatch")); - for (int i = 0; i < tensor_ids.size(); i++) { - std::string tensor_id = tensor_ids[i]; - resources_->tensors.emplace(output_names[i], tensor_ids[i]); - } -} - -void Compiler::InsertTensors(const std::vector& output_names, - const std::string& tensor_id) { - PADDLE_ENFORCE_EQ(output_names.size(), 1, - platform::errors::Fatal("InsertTensors size mismatch")); - 
resources_->tensors.emplace(output_names[0], tensor_id); -} - -void Compiler::SetIpuIndexStage(const std::vector& tensor_ids, - const OpDesc* op_desc) { - VLOG(10) << "enter Compiler::SetIpuIndexStage"; +void Compiler::PostLower(const std::vector& tensor_ids, + const OpDesc* op_desc) { + // Set pipline + // Due to the limitation of popart, if an op has multiple outputs, + // pipline settings needs to be set at the same time auto tensor_ids_set = std::set(tensor_ids.begin(), tensor_ids.end()); - if (op_desc->HasAttr(sIpuIndexAttr)) { auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); builder_->virtualGraph(tensor_ids_set, ipu_index); @@ -639,18 +736,37 @@ void Compiler::SetIpuIndexStage(const std::vector& tensor_ids, if (op_desc->HasAttr(sIpuStageAttr)) { auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); builder_->pipelineStage(tensor_ids_set, ipu_stage); - VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + VLOG(10) << "set " << sIpuStageAttr << " = " << ipu_stage << " for op: " << op_desc->Type(); } } - VLOG(10) << "leave Compiler::SetIpuIndexStage"; + // Record output tensors + auto pd_outs = GetOpOutputs(op_desc); + PADDLE_ENFORCE_EQ( + pd_outs.size(), tensor_ids.size(), + platform::errors::Fatal("paddle and popart op have different outputs")); + for (int i = 0; i < tensor_ids.size(); ++i) { + resources_->tensors.emplace(pd_outs[i], tensor_ids[i]); + } + for (auto& tensor_id : tensor_ids) { + PostLower(tensor_id, op_desc, true); + } } -void Compiler::SetIpuIndexStage(const std::string& tensor_id, - const OpDesc* op_desc) { - VLOG(10) << "enter Compiler::SetIpuIndexStage"; +void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc) { + // Record output tensor + auto pd_outs = GetOpOutputs(op_desc); + PADDLE_ENFORCE_EQ( + pd_outs.size(), 1, + platform::errors::Fatal("paddle and popart op have different outputs")); + resources_->tensors.emplace(pd_outs[0], tensor_id); + PostLower(tensor_id, op_desc, false); +} - if (op_desc->HasAttr(sIpuIndexAttr)) { +void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc, + bool skip_pipline) { + // Set pipline + if (!skip_pipline && op_desc->HasAttr(sIpuIndexAttr)) { auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); builder_->virtualGraph(tensor_id, ipu_index); VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index @@ -658,32 +774,18 @@ void Compiler::SetIpuIndexStage(const std::string& tensor_id, if (op_desc->HasAttr(sIpuStageAttr)) { auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); builder_->pipelineStage(tensor_id, ipu_stage); - VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + VLOG(10) << "set " << sIpuStageAttr << " = " << ipu_stage << " for op: " << op_desc->Type(); } } - VLOG(10) << "leave Compiler::SetIpuIndexStage"; -} - -void Compiler::SetAMPAttributes(const std::vector& tensor_ids, - const OpDesc* op_desc) { - if (op_desc->Type() == "popart_matmul") { - for (const auto& tensor_id : tensor_ids) { - SetAMPAttributes(tensor_id, op_desc); - } - } -} - -void Compiler::SetAMPAttributes(const std::string& tensor_id, - const OpDesc* op_desc) { - VLOG(10) << "enter Compiler::SetAMPAttributes"; + // Set amp if (op_desc->Type() == "popart_matmul") { if (set_amp_for_all_) { auto amp = ipu_strategy_->available_memory_proportion; if (amp < 0.0f || amp > 1.0) { PADDLE_THROW(platform::errors::InvalidArgument( - "AvailableMemoryProportion %f is invalid, which should be set 0 <= " - "amp <= 1", + 
"AvailableMemoryProportion %f is invalid, which should be in " + "range [0.0, 1.0]", amp)); } if (amp > 0.0f) { @@ -694,8 +796,8 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute)); if (amp < 0.0f || amp > 1.0) { PADDLE_THROW(platform::errors::InvalidArgument( - "AvailableMemoryProportion %f is invalid, which should be set 0 " - "<= amp <= 1", + "AvailableMemoryProportion %f is invalid, which should be in " + "range [0.0, 1.0]", amp)); } if (amp > 0.0f) { @@ -705,17 +807,7 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, } } } - } - VLOG(10) << "leave Compiler::SetAMPAttributes"; -} - -void Compiler::SetSerializeAttributes( - const std::vector& tensor_ids, const OpDesc* op_desc) { - VLOG(10) << "enter Compiler::SetSerializeAttributes"; - auto tensor_ids_set = - std::set(tensor_ids.begin(), tensor_ids.end()); - - if (op_desc->Type() == "popart_matmul") { + // Set serialize matmul if (op_desc->HasAttr(sMatmulSerializeFactor)) { auto factor = BOOST_GET_CONST(int, op_desc->GetAttr(sMatmulSerializeFactor)); @@ -724,16 +816,9 @@ void Compiler::SetSerializeAttributes( mode = BOOST_GET_CONST(std::string, op_desc->GetAttr(sMatmulSerializeMode)); } - builder_->setSerializeMatMul(tensor_ids_set, mode, (int64_t)factor, true); + builder_->setSerializeMatMul({tensor_id}, mode, factor, true); } } - VLOG(10) << "leave Compiler::SetSerializeAttributes"; -} - -void Compiler::SetSerializeAttributes(const std::string& tensor_id, - const OpDesc* op_desc) { - std::vector tensor_ids = {tensor_id}; - SetSerializeAttributes(tensor_ids, op_desc); } void Compiler::SetCustomOps( @@ -749,13 +834,7 @@ std::string Compiler::GetFP16ModelProto() { return graph_transformer.getModelProto(); } -std::string Compiler::GetModelProto() { - if (ipu_strategy_->enable_fp16) { - return GetFP16ModelProto(); - } else { - return builder_->getModelProto(); - } -} +std::string Compiler::GetModelProto() { return builder_->getModelProto(); } void Compiler::SaveModelProto(const std::string& path) { builder_->saveModelProto(path); @@ -793,29 +872,6 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { return popart::DebugContext(op_identify_id); } -void Compiler::PushNameScope(const OpDesc* op) { - auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); - if (op_namescope == "/") { - return; - } - if (!op_namescope.empty()) { - op_namescope.pop_back(); - } - if (!op_namescope.empty()) { - op_namescope.erase(op_namescope.begin()); - } - VLOG(10) << "name_scope is: " << op_namescope; - builder_->pushNameScope(op_namescope); -} - -void Compiler::PopNameScope(const OpDesc* op) { - auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); - if (op_namescope == "/") { - return; - } - builder_->popNameScope(); -} - } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 2d00970bf1297..6f4e602af82df 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -17,16 +17,15 @@ #include #include #include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device/ipu/ipu_names.h" -#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" + #include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu 
{ +class IpuStrategy; + struct CompilerResources { // popart input tensor_ids std::vector inputs; @@ -70,7 +69,7 @@ struct CompilerResources { std::unique_ptr optimizer; }; -// helper for lowering graph +// Helper for lowering graph struct GraphHelper { explicit GraphHelper(const Graph *); @@ -114,23 +113,9 @@ class Compiler { const std::vector &GetOpOutputs(const OpDesc *op); const std::string GetNameScope(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); - - void InsertTensors(const std::vector &output_names, - const std::vector &tensor_ids); - void InsertTensors(const std::vector &output_names, - const std::string &tensor_id); - void SetIpuIndexStage(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); - void SetAMPAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); - void SetSerializeAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetSerializeAttributes(const std::string &tensor_id, - const OpDesc *op_desc); - void PushNameScope(const OpDesc *op); - void PopNameScope(const OpDesc *op); + void PostLower(const std::vector &, const OpDesc *); + void PostLower(const std::string &, const OpDesc *); + void PostLower(const std::string &, const OpDesc *, bool); private: std::unique_ptr builder_; diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index 2459f5140eb5b..b7a83b2ef1a61 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -13,14 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_device.h" + +#include + #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { namespace ipu { -// TODO(alleng) merge with ipu_utils -static bool GetBoolEnv(std::string str) { +namespace { +const bool GetBoolEnv(const std::string& str) { char* str_val = getenv(str.c_str()); if (str_val == NULL) { return false; @@ -32,6 +35,7 @@ static bool GetBoolEnv(std::string str) { return val; } } +} // namespace int GetNumDevices() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); diff --git a/paddle/fluid/platform/device/ipu/ipu_device.h b/paddle/fluid/platform/device/ipu/ipu_device.h index d39feffc92655..c6876c032c8e4 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 649b291244110..b020e4f219743 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -14,12 +14,80 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_executor.h" -using float16 = paddle::platform::float16; +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" namespace paddle { namespace platform { namespace ipu { +namespace { + +// Get paddle prefix and popart postfix of weight states +// Format: {popart_postfix, paddle_prefix} +std::vector> GetOptPrePostfix( + const std::string &opt_type) { + std::vector> pre_post_fix; + // Weight self + pre_post_fix.push_back(std::make_pair("", "")); + + // Weight states + // TODO(alleng) support pair("Accl1___", "_moment1_{id!=0}") + if (opt_type == "adam" || opt_type == "lamb" || opt_type == "adamw") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + } else if (opt_type == "momentum") { + pre_post_fix.push_back(std::make_pair("Accl___", "_velocity_0")); + } else if (opt_type == "adamax") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_inf_norm__0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + } else if (opt_type == "adagrad") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0")); + } else if (opt_type == "adadelta") { + pre_post_fix.push_back(std::make_pair("Accl1___", "__avg_squared_grad_0")); + pre_post_fix.push_back( + std::make_pair("Accl2___", "__avg_squared_update_0")); + } else if (opt_type == "rmsprop") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_mean_square_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_mean_grad_0")); + pre_post_fix.push_back(std::make_pair("Accl3___", "_momentum__0")); + } + return pre_post_fix; +} + +class PdIArray final : public popart::IArray { + public: + explicit PdIArray(const Tensor *tensor) { + tensor_.ShareDataWith(*tensor); + for (int i = 0; i < tensor->dims().size(); ++i) { + shape_.push_back(tensor->dims().at(i)); + } + } + + public: + void *data() { return tensor_.data(); } + popart::DataType dataType() const { + return PhiDType2PopartDType(tensor_.dtype()); + } + std::size_t rank() const { return tensor_.dims().size(); } + int64_t dim(size_t index) const { return tensor_.dims().at(index); } + std::size_t nelms() const { + return std::accumulate(shape_.begin(), shape_.end(), + static_cast(1), std::multiplies()); + } + const popart::Shape shape() const { return shape_; } + + private: + Tensor tensor_; + std::vector shape_; +}; + +} // namespace + Executor::~Executor() { Detach(); session_.reset(); @@ -76,15 +144,15 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "enter Executor::Run"; // inputs std::map popart_inputs; - std::map input_wrappers; + std::map input_wrappers; for (size_t i = 0; i < inputs.size(); i++) { auto tensor_id = compiler_resources_->inputs[i]; - input_wrappers.emplace(tensor_id, PaddleIArray(inputs[i])); + input_wrappers.emplace(tensor_id, PdIArray(inputs[i])); popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); } // anchors std::map popart_anchors; - std::map anchor_wrappers; + std::map anchor_wrappers; for (size_t i = 0; i < outputs.size(); i++) { auto tensor_id = compiler_resources_->outputs[i]; // get dims & dtype from session @@ -106,10 +174,10 @@ void Executor::Run(const std::vector &inputs, 
auto *tensor = outputs[i]; tensor->Resize(phi::make_ddim(output_shape)); auto fetch_dtype = fetch_info.dataType(); - auto paddle_type = PopartType2VarType(fetch_dtype); + auto paddle_type = PopartDType2VarType(fetch_dtype); tensor->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(paddle_type)); - anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + anchor_wrappers.emplace(tensor_id, PdIArray(tensor)); popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); } VLOG(10) << "Prepared inputs/anchors"; @@ -169,16 +237,16 @@ void Executor::AcquireDevice() { device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById( device_id); PADDLE_ENFORCE_NOT_NULL( - device_, platform::errors::Unavailable( - "Can't attach IPU in distribution, ipu_num = %d.", - RequestIpus(ipu_strategy_->num_ipus))); + device_, + errors::Unavailable("Can't attach IPU in distribution, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } else { device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( RequestIpus(ipu_strategy_->num_ipus)); - PADDLE_ENFORCE_NOT_NULL(device_, platform::errors::Unavailable( - "Can't attach IPU, ipu_num = %d.", - RequestIpus(ipu_strategy_->num_ipus))); + PADDLE_ENFORCE_NOT_NULL( + device_, errors::Unavailable("Can't attach IPU, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } VLOG(10) << "leave Executor::AcquireDevice"; } @@ -226,13 +294,13 @@ void Executor::SetWeightsIO() { void Executor::ConvertWeights(bool align_to_popart) { for (auto weight_pair : executor_resources_->weights_and_opt_state) { auto paddle_var = scope_->GetVar(weight_pair.second); - auto paddle_var_dtype = PdDataType2PopartType( + auto paddle_var_dtype = PhiDType2PopartDType( paddle_var->GetMutable()->dtype()); PADDLE_ENFORCE_EQ((paddle_var_dtype == popart::DataType::FLOAT || paddle_var_dtype == popart::DataType::FLOAT16), true, - platform::errors::InvalidArgument( + errors::InvalidArgument( "Currently, we only support FLOAT16 and FLOAT with " "Paddle, but received type is %s.", paddle_var_dtype)); @@ -242,7 +310,7 @@ void Executor::ConvertWeights(bool align_to_popart) { PADDLE_ENFORCE_EQ((popart_var_dtype == popart::DataType::FLOAT || popart_var_dtype == popart::DataType::FLOAT16), true, - platform::errors::InvalidArgument( + errors::InvalidArgument( "Currently, we only support FLOAT16 and FLOAT with " "popart, but received type is %s.", popart_var_dtype)); @@ -276,8 +344,8 @@ void Executor::ConvertWeights(bool align_to_popart) { num_elem * sizeof(float)); } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Convert Paddle FLOAT16 to popart FLOAT")); + PADDLE_THROW( + errors::Unimplemented("Convert Paddle FLOAT16 to popart FLOAT")); } } } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index c59e623ab20b0..c03a52a77a9d7 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -22,17 +22,21 @@ limitations under the License. 
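The GetOptPrePostfix helper added in ipu_executor.cc above pairs a popart tensor-id prefix with a paddle variable-name postfix for each optimizer state. Assuming the first element is prepended to the popart id and the second appended to the paddle name, which is what the naming suggests, the adam case would expand as in this standalone sketch (the weight name is hypothetical, not from the patch):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      const std::string weight = "fc_0.w_0";  // hypothetical weight name
      // Mirrors the pairs returned for opt_type == "adam" above.
      const std::vector<std::pair<std::string, std::string>> pre_post_fix = {
          {"", ""},                         // the weight itself
          {"Accl1___", "_moment1_0"},       // first moment
          {"Accl2___", "_moment2_0"},       // second moment
          {"Step___", "_beta1_pow_acc_0"},  // step counter
      };

      for (const auto& pp : pre_post_fix) {
        const std::string popart_name = pp.first + weight;
        const std::string paddle_name = weight + pp.second;
        std::cout << popart_name << " <-> " << paddle_name << std::endl;
      }
      // e.g. "Accl1___fc_0.w_0 <-> fc_0.w_0_moment1_0"
      return 0;
    }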
*/ #include #include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" -#include "paddle/fluid/platform/device/ipu/ipu_names.h" -#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +} // namespace paddle + namespace paddle { namespace platform { namespace ipu { +struct CompilerResources; +class IpuStrategy; + struct ExecutorResources { // map popart::WeightsIO weights_io; @@ -45,18 +49,18 @@ class Executor { Executor() = default; ~Executor(); - // build popart session + // Build popart session void Prepare(const std::string &proto); - // run popart session + // Run popart session void Run(const std::vector &inputs, const std::vector &outputs, const framework::ExecutionContext &ctx); - // sync weights from popart to paddle + // Sync weights from popart to paddle void WeightsToHost(); - // detach IPU + // Detach IPU void Detach(); // Scope @@ -83,16 +87,16 @@ class Executor { void WeightsToPaddle(); private: - // not own + // Not own const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; CompilerResources *compiler_resources_ = nullptr; - // deviceinfo for popart session + // Device info for popart session std::shared_ptr device_; - // popart session, where graph running + // Popart session, where the graph runs std::unique_ptr session_; - // one OneSession means a graph + // An ExecutorResources corresponds to a graph std::unique_ptr executor_resources_; }; diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc index 9e6951c37139d..749628ffac452 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.cc +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/fluid/platform/device/ipu/ipu_info.h" + #include "paddle/fluid/platform/device/ipu/ipu_device.h" namespace paddle { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index f52499a8d8fda..aff5498243000 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -64,7 +64,6 @@ IpuStrategy::IpuStrategy() { ADD_BOOL_OPTION(is_training); ADD_BOOL_OPTION(need_avg_shard); ADD_BOOL_OPTION(enable_fp16); - ADD_BOOL_OPTION(transfer_cast_op); ADD_BOOL_OPTION(use_no_bias_optimizer); ADD_BOOL_OPTION(enable_distribution); ADD_BOOL_OPTION(scaled_optimizer_state); @@ -316,8 +315,10 @@ IpuStrategy::IpuStrategy() { RegisterSetter(bool_options, "enable_half_partial", [&](bool value) { if (value) { popart_options.partialsTypeMatMuls = "half"; + popart_options.convolutionOptions.insert({{"partialsType", "half"}}); } else { popart_options.partialsTypeMatMuls = "float"; + popart_options.convolutionOptions.insert({{"partialsType", "float"}}); } }); @@ -412,6 +413,15 @@ IpuStrategy::IpuStrategy() { RegisterGetter(map_options_getter, options_type, "gcl_options", "map", [&]() { return popart_options.gclOptions; }); + + // Default options + + // Can also be set as a custom logger in python, like using tqdm + popart_options.compilationProgressLogger = [](int progress, int total) { + if (progress % 10 == 0) { + VLOG(1) << "compile progress: " << progress << "%"; + } + }; } void IpuStrategy::AddBoolOption(const std::string& option, bool value) { @@ -513,6 +523,11 @@ void IpuStrategy::AddCustomOp(const std::string& paddle_op, IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); } +void IpuStrategy::SetCompilationProgressLogger( + const std::function& logger) { + popart_options.compilationProgressLogger = logger; +} + std::string IpuStrategy::GetOption(const std::string& option) { return get(option, options_getter); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 1802eb16e5895..fa57dcd676d81 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -43,9 +43,6 @@ class IpuStrategy { // Flag for fp16, true for pure fp16 bool enable_fp16 = false; - // Enable transfer cast Op target from fp32 to fp16 in fp16 mode - bool transfer_cast_op = true; - // The mode of Adam/Lamb optimizer // false: The standard Adam/Lamb optimizer // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART @@ -125,6 +122,8 @@ class IpuStrategy { const std::vector &values); void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, const std::string &domain, int version); + void SetCompilationProgressLogger( + const std::function &logger); std::string GetOption(const std::string &); std::vector GetVectorOption(const std::string &); diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc index 720de822608b6..43e4a6820c813 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.cc +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -13,133 +13,111 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_utils.h" + #include namespace paddle { namespace platform { namespace ipu { -void* PaddleIArray::data() { return tensor_.data(); } - -popart::DataType PaddleIArray::dataType() const { - return PdDataType2PopartType(tensor_.dtype()); -} - -std::size_t PaddleIArray::rank() const { return tensor_.dims().size(); } - -int64_t PaddleIArray::dim(size_t index) const { - return tensor_.dims().at(index); -} - -std::size_t PaddleIArray::nelms() const { - return std::accumulate(shape_.begin(), shape_.end(), static_cast(1), - std::multiplies()); -} - -const popart::Shape PaddleIArray::shape() const { return shape_; } - -popart::DataType VarType2PopartType( - const framework::proto::VarType::Type type) { +const popart::DataType VarType2PopartDType(const VarType::Type type) { switch (type) { - case framework::proto::VarType::UINT8: + case VarType::UINT8: return popart::DataType::UINT8; - case framework::proto::VarType::INT8: + case VarType::INT8: return popart::DataType::INT8; - case framework::proto::VarType::INT16: + case VarType::INT16: return popart::DataType::INT16; - case framework::proto::VarType::INT32: + case VarType::INT32: return popart::DataType::INT32; - case framework::proto::VarType::INT64: + case VarType::INT64: return popart::DataType::INT64; - case framework::proto::VarType::BOOL: + case VarType::BOOL: return popart::DataType::BOOL; - case framework::proto::VarType::FP64: + case VarType::FP64: return popart::DataType::DOUBLE; - case framework::proto::VarType::FP32: + case VarType::FP32: return popart::DataType::FLOAT; - case framework::proto::VarType::FP16: + case VarType::FP16: return popart::DataType::FLOAT16; - case framework::proto::VarType::BF16: + case VarType::BF16: return popart::DataType::BFLOAT16; - case framework::proto::VarType::COMPLEX64: + case VarType::COMPLEX64: return popart::DataType::COMPLEX64; - case framework::proto::VarType::COMPLEX128: + case VarType::COMPLEX128: return popart::DataType::COMPLEX128; default: - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported Paddle var type.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported VarType::Type when converting to popart data type.")); } } -popart::DataType PdDataType2PopartType( - const paddle::experimental::DataType type) { +const popart::DataType PhiDType2PopartDType(const phi::DataType type) { switch (type) { - case paddle::experimental::DataType::UINT8: + case phi::DataType::UINT8: return popart::DataType::UINT8; - case paddle::experimental::DataType::INT8: + case phi::DataType::INT8: return popart::DataType::INT8; - case paddle::experimental::DataType::INT16: + case phi::DataType::INT16: return popart::DataType::INT16; - case paddle::experimental::DataType::INT32: + case phi::DataType::INT32: return popart::DataType::INT32; - case paddle::experimental::DataType::INT64: + case phi::DataType::INT64: return popart::DataType::INT64; - case paddle::experimental::DataType::BOOL: + case phi::DataType::BOOL: return popart::DataType::BOOL; - case paddle::experimental::DataType::FLOAT64: + case phi::DataType::FLOAT64: return popart::DataType::DOUBLE; - case paddle::experimental::DataType::FLOAT32: + case phi::DataType::FLOAT32: return popart::DataType::FLOAT; - case paddle::experimental::DataType::FLOAT16: + case phi::DataType::FLOAT16: return popart::DataType::FLOAT16; - case paddle::experimental::DataType::BFLOAT16: + case phi::DataType::BFLOAT16: return popart::DataType::BFLOAT16; - case paddle::experimental::DataType::COMPLEX64: + case 
phi::DataType::COMPLEX64: return popart::DataType::COMPLEX64; - case paddle::experimental::DataType::COMPLEX128: + case phi::DataType::COMPLEX128: return popart::DataType::COMPLEX128; default: - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported Paddle data type.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported phi::DataType when converting to popart data type.")); } } -framework::proto::VarType::Type PopartType2VarType( - const popart::DataType type) { +const VarType::Type PopartDType2VarType(const popart::DataType type) { switch (type) { case popart::DataType::UINT8: - return framework::proto::VarType::UINT8; + return VarType::UINT8; case popart::DataType::INT8: - return framework::proto::VarType::INT8; + return VarType::INT8; case popart::DataType::INT16: - return framework::proto::VarType::INT16; + return VarType::INT16; case popart::DataType::INT32: - return framework::proto::VarType::INT32; + return VarType::INT32; case popart::DataType::INT64: - return framework::proto::VarType::INT64; + return VarType::INT64; case popart::DataType::BOOL: - return framework::proto::VarType::BOOL; + return VarType::BOOL; case popart::DataType::DOUBLE: - return framework::proto::VarType::FP64; + return VarType::FP64; case popart::DataType::FLOAT: - return framework::proto::VarType::FP32; + return VarType::FP32; case popart::DataType::FLOAT16: - return framework::proto::VarType::FP16; + return VarType::FP16; case popart::DataType::BFLOAT16: - return framework::proto::VarType::BF16; + return VarType::BF16; case popart::DataType::COMPLEX64: - return framework::proto::VarType::COMPLEX64; + return VarType::COMPLEX64; case popart::DataType::COMPLEX128: - return framework::proto::VarType::COMPLEX128; + return VarType::COMPLEX128; default: - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Unsupported Paddle var type.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported popart::DataType when converting to var type.")); } } -popart::DataType OnnxDtype2PopartType(const int type) { - auto dtype = static_cast(type); - switch (dtype) { +const popart::DataType OnnxDType2PopartType(const ONNXDataType type) { + switch (type) { case ONNXDataType::BOOL: return popart::DataType::BOOL; case ONNXDataType::INT16: @@ -166,12 +144,69 @@ popart::DataType OnnxDtype2PopartType(const int type) { return popart::DataType::COMPLEX128; default: PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported ONNX data type: %d.", dtype)); + "Unsupported ONNXDataType when converting to popart data type.")); } } -// count num should > 0 -bool GetBoolEnv(std::string str) { +const ONNXDataType VarType2OnnxDType(const VarType::Type type) { + switch (type) { + case VarType::BOOL: + return ONNXDataType::BOOL; + case VarType::INT16: + return ONNXDataType::INT16; + case VarType::INT32: + return ONNXDataType::INT32; + case VarType::INT64: + return ONNXDataType::INT64; + case VarType::FP16: + return ONNXDataType::FLOAT16; + case VarType::FP32: + return ONNXDataType::FLOAT; + case VarType::FP64: + return ONNXDataType::DOUBLE; + case VarType::UINT8: + return ONNXDataType::UINT8; + case VarType::INT8: + return ONNXDataType::INT8; + case VarType::BF16: + return ONNXDataType::BFLOAT16; + case VarType::COMPLEX64: + return ONNXDataType::COMPLEX64; + case VarType::COMPLEX128: + return ONNXDataType::COMPLEX128; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported VarType::Type when converting to onnx data type.")); + } +} + +const std::string VarType2PopartStr(const VarType::Type 
type) { + switch (type) { + case VarType::UINT8: + return "UINT8"; + case VarType::INT8: + return "INT8"; + case VarType::INT16: + return "INT16"; + case VarType::INT32: + return "INT32"; + case VarType::INT64: + return "INT64"; + case VarType::BOOL: + return "BOOL"; + case VarType::FP64: + return "DOUBLE"; + case VarType::FP32: + return "FLOAT"; + case VarType::FP16: + return "FLOAT16"; + default: + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported VarType::Type when converting to popart type string.")); + } +} + +const bool GetBoolEnv(const std::string& str) { char* str_val = getenv(str.c_str()); if (str_val == NULL) { return false; @@ -184,29 +219,7 @@ bool GetBoolEnv(std::string str) { } } -std::vector> GetOptPrePostfix( - const std::string& opt_type) { - // format: {popart_tensor_id, paddle_tensor_id}, ... - std::vector> pre_post_fix; - - if (opt_type == "adam" || opt_type == "lamb") { - pre_post_fix.push_back(std::make_pair("", "")); - pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); - pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); - pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); - } else if (opt_type == "sgd" || opt_type == "momentum") { - // sgd - pre_post_fix.push_back(std::make_pair("", "")); - } else { - pre_post_fix.push_back(std::make_pair("", "")); - // - } - - return pre_post_fix; -} - -int RequestIpus(const int num_ipus) { - // num_ipus must be pow(2, n); +const int RequestIpus(const int num_ipus) { return std::pow(2, ceil(log2(num_ipus))); } diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 7644513cc0207..2737f40295390 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -19,155 +19,32 @@ limitations under the License. 
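RequestIpus above rounds the requested device count up to the next power of two, since IPUs can only be acquired in powers of two. A standalone sketch of the same rounding rule with a few sample values (the helper name here is illustrative):

    #include <cmath>
    #include <iostream>

    // Round up to the next power of two, as RequestIpus does above.
    // Assumes num_ipus >= 1.
    int RoundUpToPow2(int num_ipus) {
      return static_cast<int>(std::pow(2, std::ceil(std::log2(num_ipus))));
    }

    int main() {
      for (int n : {1, 2, 3, 5, 8, 9}) {
        std::cout << n << " -> " << RoundUpToPow2(n) << std::endl;
      }
      // 1 -> 1, 2 -> 2, 3 -> 4, 5 -> 8, 8 -> 8, 9 -> 16
      return 0;
    }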
*/ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/float16.h" +using float16 = paddle::platform::float16; +using Tensor = paddle::framework::Tensor; +using LoDTensor = paddle::framework::LoDTensor; +using Scope = paddle::framework::Scope; +using OpDesc = paddle::framework::OpDesc; +using Graph = paddle::framework::ir::Graph; +using Node = paddle::framework::ir::Node; +using BlockDesc = paddle::framework::BlockDesc; +using VarType = paddle::framework::proto::VarType; + namespace paddle { namespace platform { namespace ipu { -using float16 = platform::float16; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using Scope = framework::Scope; -using OpDesc = framework::OpDesc; -using Graph = framework::ir::Graph; -using Node = framework::ir::Node; -using BlockDesc = framework::BlockDesc; - -// onnx dtype -// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 -enum ONNXDataType : int { - UNDEFINED = 0, - FLOAT = 1, - UINT8 = 2, - INT8 = 3, - UINT16 = 4, - INT16 = 5, - INT32 = 6, - INT64 = 7, - STRING = 8, - BOOL = 9, - FLOAT16 = 10, - DOUBLE = 11, - UINT32 = 12, - UINT64 = 13, - COMPLEX64 = 14, - COMPLEX128 = 15, - BFLOAT16 = 16 -}; - -class PaddleIArray final : public popart::IArray { - public: - explicit PaddleIArray(const Tensor* tensor) { - tensor_.ShareDataWith(*tensor); - for (int i = 0; i < tensor->dims().size(); ++i) { - shape_.push_back(tensor->dims().at(i)); - } - } - - public: - void* data(); - popart::DataType dataType() const; - std::size_t rank() const; - int64_t dim(size_t index) const; - std::size_t nelms() const; - const popart::Shape shape() const; - - private: - Tensor tensor_; - std::vector shape_; -}; - -popart::DataType VarType2PopartType(const framework::proto::VarType::Type type); -popart::DataType PdDataType2PopartType( - const paddle::experimental::DataType type); -framework::proto::VarType::Type PopartType2VarType(const popart::DataType type); -popart::DataType OnnxDtype2PopartType(const int type); -bool GetBoolEnv(std::string str); - -template -std::unique_ptr> Tensor2IArray(const Tensor& tensor) { - auto dtype = PdDataType2PopartType(tensor.dtype()); - auto shape = std::vector(); - for (size_t i = 0; i < tensor.dims().size(); ++i) { - shape.push_back(tensor.dims().at(i)); - } - popart::TensorInfo tensor_info(dtype, shape); - - return std::make_unique>( - reinterpret_cast(tensor.data()), tensor_info); -} - -template -std::unique_ptr> LoDTensor2IArray( - LoDTensor const& lod_tensor) { - if (lod_tensor.lod().size() == 0) { - return Tensor2IArray(lod_tensor); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("LoDTensor2IArray is Unimplemented")); - } -} - template T GetSingleVarFromScope(const Scope* scope, const std::string& var_name) { auto var = scope->GetVar(var_name); auto tensor = var->Get(); - // check dtype is ? 
return tensor.data()[0]; } -struct CustomOpAttrVisitor : public boost::static_visitor { - explicit CustomOpAttrVisitor(std::map* attr, - const std::string& attr_name) - : attrs_(attr), attr_name_(attr_name) {} - mutable std::map* attrs_; - std::string attr_name_; - - void operator()(int v) const { attrs_->emplace(attr_name_, v); } - void operator()(float v) const { attrs_->emplace(attr_name_, v); } - void operator()(const std::string& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(bool v) const { attrs_->emplace(attr_name_, v); } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(BlockDesc* desc) const { - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `BlockDesc` type.")); - } - void operator()(const std::vector& v) const { - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `BlockDesc` type.")); - } - void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(boost::blank) const { - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `boost::blank` type.")); - } -}; - struct IpuCustomOpIdentifier { IpuCustomOpIdentifier(const std::string& _paddle_op, const std::string& _popart_op, @@ -185,54 +62,44 @@ struct IpuCustomOpIdentifier { popart::OperatorIdentifier popart_op; }; -struct ConstantOpAttrVisitor : public boost::static_visitor { - explicit ConstantOpAttrVisitor(framework::LoDTensor* tensor, - framework::proto::VarType::Type dtype) - : tensor_(tensor), dtype_(dtype) {} - framework::LoDTensor* tensor_; - framework::proto::VarType::Type dtype_; - - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void operator()(const std::vector& vec) const { - if (dtype_ == framework::proto::VarType::FP16) { - std::vector vec_fp16; - std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16), - [](float f) -> float16 { return float16(f); }); - framework::TensorFromVector(vec_fp16, tensor_); - } else { - framework::TensorFromVector(vec, tensor_); - } - } - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void RaiseError() const { - PADDLE_THROW( - platform::errors::InvalidArgument("Constant value must be a vector")); - } - void operator()(int v) const { RaiseError(); } - void operator()(float v) const { RaiseError(); } - void operator()(const std::string& v) const { RaiseError(); } - void operator()(const std::vector& v) const { RaiseError(); } - void operator()(bool v) const { RaiseError(); } - void operator()(BlockDesc* desc) const { RaiseError(); } - void operator()(const std::vector& v) const { RaiseError(); } - void operator()(int64_t v) const { RaiseError(); } - void operator()(boost::blank) const { RaiseError(); } +// Onnx dtype +// 
https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 +enum ONNXDataType : int { + UNDEFINED = 0, + FLOAT = 1, + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + INT32 = 6, + INT64 = 7, + STRING = 8, + BOOL = 9, + FLOAT16 = 10, + DOUBLE = 11, + UINT32 = 12, + UINT64 = 13, + COMPLEX64 = 14, + COMPLEX128 = 15, + BFLOAT16 = 16 }; -std::vector> GetOptPrePostfix( - const std::string& opt_type); - -int RequestIpus(const int num_ipus); +// VarType::Type to popart::DataType +const popart::DataType VarType2PopartDType(const VarType::Type type); +// phi::DataType to popart::DataType +const popart::DataType PhiDType2PopartDType(const phi::DataType type); +// popart::DataType to VarType::Type +const VarType::Type PopartDType2VarType(const popart::DataType type); +// ONNXDataType to popart::DataType +const popart::DataType OnnxDType2PopartType(const ONNXDataType type); +// VarType::Type to ONNXDataType +const ONNXDataType VarType2OnnxDType(const VarType::Type type); +// VarType::Type to String in Popart +const std::string VarType2PopartStr(const VarType::Type type); +// Get bool from environment variable +const bool GetBoolEnv(const std::string& str); +// Requested number of IPUs must be pow(2, n) +const int RequestIpus(const int num_ipus); } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc index ab9ddfde21873..254e566567424 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -56,15 +56,15 @@ Node *gelu_handler(Graph *graph, Node *node) { auto sqrt2 = CreateConst(graph, node, {}, {}, {{"value", std::vector{1.4142135623730951}}, {"dims", std::vector{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto zero_point_five = CreateConst(graph, node, {}, {}, {{"value", std::vector{0.5}}, {"dims", std::vector{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto one = CreateConst(graph, node, {}, {}, {{"value", std::vector{1}}, {"dims", std::vector{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto div = CreateBaseOp(graph, node, "popart_div", {GetInputVarNode("X", node), sqrt2->outputs[0]}, {}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc index 3d22f75d345d6..7a14d23698def 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -18,7 +18,6 @@ namespace paddle { namespace platform { namespace ipu { -// This avoids the static initialisation order fiasco, std::unordered_map &SymbolHandlers() { static std::unordered_map symbol_handlers; return symbol_handlers; @@ -34,8 +33,6 @@ bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) { return new_handler; } -// Return a pointer to a handler if one is registered for this kind of node or -// an empty std::function otherwise.
SymbolHandler GetHandler(const std::string &kind) { auto it = SymbolHandlers().find(kind); if (it != SymbolHandlers().end()) { @@ -84,66 +81,6 @@ void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, } } -const int VarType2OnnxDtype(const int type) { - auto dtype = static_cast(type); - switch (dtype) { - case framework::proto::VarType::BOOL: - return static_cast(ONNXDataType::BOOL); - case framework::proto::VarType::INT16: - return static_cast(ONNXDataType::INT16); - case framework::proto::VarType::INT32: - return static_cast(ONNXDataType::INT32); - case framework::proto::VarType::INT64: - return static_cast(ONNXDataType::INT64); - case framework::proto::VarType::FP16: - return static_cast(ONNXDataType::FLOAT16); - case framework::proto::VarType::FP32: - return static_cast(ONNXDataType::FLOAT); - case framework::proto::VarType::FP64: - return static_cast(ONNXDataType::DOUBLE); - case framework::proto::VarType::UINT8: - return static_cast(ONNXDataType::UINT8); - case framework::proto::VarType::INT8: - return static_cast(ONNXDataType::INT8); - case framework::proto::VarType::BF16: - return static_cast(ONNXDataType::BFLOAT16); - case framework::proto::VarType::COMPLEX64: - return static_cast(ONNXDataType::COMPLEX64); - case framework::proto::VarType::COMPLEX128: - return static_cast(ONNXDataType::COMPLEX128); - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported data type: %d.", dtype)); - } -} - -const std::string VarType2PopStr(const int type) { - auto dtype = static_cast(type); - switch (dtype) { - case framework::proto::VarType::UINT8: - return "UINT8"; - case framework::proto::VarType::INT8: - return "INT8"; - case framework::proto::VarType::INT16: - return "INT16"; - case framework::proto::VarType::INT32: - return "INT32"; - case framework::proto::VarType::INT64: - return "INT64"; - case framework::proto::VarType::BOOL: - return "BOOL"; - case framework::proto::VarType::FP64: - return "DOUBLE"; - case framework::proto::VarType::FP32: - return "FLOAT"; - case framework::proto::VarType::FP16: - return "FLOAT16"; - default: - PADDLE_THROW( - paddle::platform::errors::Unavailable("Unsupported data type.")); - } -} - Node *GetInputVarNode(const std::string &input_name, const Node *op_node, const int id) { auto var_name = op_node->Op()->Input(input_name).at(id); @@ -180,7 +117,7 @@ const bool is_float_equal(float a, float b, float eps) { return std::fabs(a - b) <= eps; } -const int GetOutputVarDtype(const Node *node, const std::string &output_name) { +const int GetOutputVarDType(const Node *node, const std::string &output_name) { auto out_node = GetOutputVarNode(output_name, node); PADDLE_ENFORCE_NOT_NULL(out_node, platform::errors::Unavailable( "Node's out node does not exist.")); @@ -188,7 +125,7 @@ const int GetOutputVarDtype(const Node *node, const std::string &output_name) { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::Unavailable("Node is not a variable.")); auto proto_var_type = var->GetDataType(); - return VarType2OnnxDtype(proto_var_type); + return static_cast(VarType2OnnxDType(proto_var_type)); } } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h index 32133e128c588..7ac6097e0cc14 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -68,9 +68,6 @@ void 
ClearNode(Node *node); void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, bool override = false); -const int VarType2OnnxDtype(const int type); -const std::string VarType2PopStr(const int type); - Node *GetInputVarNode(const std::string &input_name, const Node *op_node, const int id = 0); Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, @@ -81,7 +78,7 @@ Node *GetOutputVarNodeByVarName(const std::string &var_name, const Node *op_node); const bool is_float_equal(float a, float b, float eps = 1e-8); -const int GetOutputVarDtype(const Node *node, +const int GetOutputVarDType(const Node *node, const std::string &output_name = "Out"); } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc index 6f82acb5b7db3..99fb76c950681 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -28,6 +28,14 @@ Node *equal_handler(Graph *graph, Node *node) { return new_node; } +Node *not_equal_handler(Graph *graph, Node *node) { + auto equal_node = CreateBaseOp( + graph, node, "popart_equal", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, {}); + return CreateBaseOp(graph, node, "popart_logical_not", + {equal_node->outputs[0]}, node->outputs, {}); +} + Node *logical_not_handler(Graph *graph, Node *node) { return CreateBaseOp(graph, node, "popart_logical_not", {GetInputVarNode("X", node)}, @@ -64,6 +72,7 @@ Node *less_than_handler(Graph *graph, Node *node) { } // namespace paddle REGISTER_HANDLER(equal, equal_handler); +REGISTER_HANDLER(not_equal, not_equal_handler); REGISTER_HANDLER(logical_not, logical_not_handler); REGISTER_HANDLER(logical_or, logical_or_handler); REGISTER_HANDLER(logical_and, logical_and_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index 444b55959cf22..af72f84c9d771 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -41,7 +41,7 @@ Node *pow_handler(Graph *graph, Node *node) { // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); auto attrs = - MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDtype(node)); + MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDType(node)); auto new_node_const = CreateConst(graph, node, {}, {}, attrs); return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), @@ -134,7 +134,7 @@ Node *matmul_handler(Graph *graph, Node *node) { } else { auto o_node = CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {}); - auto attr = MakeConstAttrMapFromValue(alpha, {1}, GetOutputVarDtype(node)); + auto attr = MakeConstAttrMapFromValue(alpha, {1}, GetOutputVarDType(node)); auto const_node = CreateConst(graph, node, {}, {}, attr); return CreateBaseOp(graph, node, "popart_mul", {o_node->outputs[0], const_node->outputs[0]}, @@ -299,6 +299,80 @@ Node *cross_entropy2_handler(Graph *graph, Node *node) { } } +Node *softmax_with_cross_entropy_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto soft_label = BOOST_GET_CONST(bool, 
op->GetAttr("soft_label")); + if (soft_label) { + PADDLE_THROW(platform::errors::InvalidArgument( + "soft_label is not supported yet in IPU")); + } + Node *new_cast = nullptr; + if (GetInputVarNode("Label", node)->Var()->GetDataType() == + framework::proto::VarType::INT32) { + new_cast = GetInputVarNode("Label", node); + } else { + auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, + {}, framework::proto::VarType::INT32); + new_cast = new_cast->outputs[0]; + } + auto softmax_node = CreateSoftmaxOpset11( + graph, node, {GetInputVarNode("Logits", node)}, {}, axis); + + auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); + if (label_shape_[label_shape_.size() - 1] != 1) { + auto log = CreateBaseOp(graph, node, "popart_log", + {softmax_node->outputs[0]}, {}, {}); + // softmax_with_cross_entropy is split to several ops in python. + // reduction is not needed here. + return CreateBaseOp( + graph, node, "popart_nllloss_v2", {log->outputs[0], new_cast}, + {GetOutputVarNode("Loss", node)}, + { + {"reduction", 2}, // popart::ReductionType::NoReduction + {"ignoreIndex", ignoreIndex}, + {"inputIsLogProbability", true}, + }); + } else { + std::vector new_shape_{label_shape_[0]}; + auto const_before_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", new_shape_}, + {"dims", + std::vector{static_cast(new_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_before_loss = + CreateBaseOp(graph, node, "popart_reshape", + {new_cast, const_before_loss->outputs[0]}, {}, {}); + + auto log = CreateBaseOp(graph, node, "popart_log", + {softmax_node->outputs[0]}, {}, {}); + auto nllloss = CreateBaseOp( + graph, node, "popart_nllloss_v2", + {log->outputs[0], reshape_before_loss->outputs[0]}, {}, + { + {"reduction", 2}, // popart::ReductionType::NoReduction + {"ignoreIndex", ignoreIndex}, + {"inputIsLogProbability", true}, + }); + + auto const_after_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", label_shape_}, + {"dims", + std::vector{static_cast(label_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_after_loss = + CreateBaseOp(graph, node, "popart_reshape", + {nllloss->outputs[0], const_after_loss->outputs[0]}, + {GetOutputVarNode("Loss", node)}, {}); + return reshape_after_loss; + } +} + Node *cumsum_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto exclusive = BOOST_GET_CONST(bool, op->GetAttr("exclusive")); @@ -378,6 +452,8 @@ REGISTER_HANDLER(matmul, matmul_handler); REGISTER_HANDLER(sum, sum_handler); REGISTER_HANDLER(softmax, softmax_handler); REGISTER_HANDLER(scale, scale_handler); +REGISTER_HANDLER(softmax_with_cross_entropy, + softmax_with_cross_entropy_handler); REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); REGISTER_HANDLER(cumsum, cumsum_handler); REGISTER_HANDLER(matmul_v2, matmul_v2_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index a08fbaa26d9ed..2e9913f58efbb 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -299,7 +299,7 @@ Node *dropout_handler(Graph *graph, Node *node) { CreateConst(graph, node, {}, {}, {{"value", std::vector{1 - dropout_prob_}}, {"dims", std::vector{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); return CreateBaseOp(graph, node, "popart_mul", {GetInputVarNode("X", node), 
scale->outputs[0]}, {GetOutputVarNode("Out", node)}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 0339097d58790..0525bb66f1618 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -124,7 +124,7 @@ Node *CreateConst(Graph *graph, Node *node, const std::vector &inputs, Node *CreateCast(Graph *graph, Node *node, const std::vector &inputs, const std::vector &outputs, const int otype) { - auto to = VarType2PopStr(otype); + auto to = VarType2PopartStr(static_cast(otype)); return CreateBaseOp(graph, node, "popart_cast", inputs, outputs, {{"to", to}}); } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h index de3788e437a42..f096beb9c4d77 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -17,8 +17,8 @@ #include "paddle/fluid/platform/device/ipu/ipu_names.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" -using paddle::framework::AttributeMap; -using paddle::framework::Attribute; +using AttributeMap = paddle::framework::AttributeMap; +using Attribute = paddle::framework::Attribute; namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 55c25bce15931..00926ee7a0b25 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -23,12 +23,14 @@ namespace { Node *fill_constant_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (!op->Input("ShapeTensor").empty()) { + auto op_inputs = op->Inputs(); + if (op_inputs.find("ShapeTensor") != op_inputs.end() && + !op->Input("ShapeTensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("op fill_constant with ShapeTensor")); } auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto dims = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto value_ = BOOST_GET_CONST(float, op->GetAttr("value")); size_t size = 1; @@ -37,19 +39,20 @@ Node *fill_constant_handler(Graph *graph, Node *node) { } Attribute value; switch (dtype_) { - case framework::proto::VarType::FP32: + case VarType::FP16: + case VarType::FP32: value = std::vector(size, value_); break; - case framework::proto::VarType::FP64: + case VarType::FP64: value = std::vector(size, value_); break; - case framework::proto::VarType::INT32: + case VarType::INT32: value = std::vector(size, value_); break; - case framework::proto::VarType::INT64: + case VarType::INT64: value = std::vector(size, value_); break; - case framework::proto::VarType::BOOL: + case VarType::BOOL: value = std::vector(size, value_); break; default: @@ -66,7 +69,7 @@ Node *gaussian_random_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto shape = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto mean = 
BOOST_GET_CONST(float, op->GetAttr("mean")); auto scale = BOOST_GET_CONST(float, op->GetAttr("std")); // seed does not work @@ -86,7 +89,7 @@ Node *uniform_random_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto shape = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto high = BOOST_GET_CONST(float, op->GetAttr("max")); auto low = BOOST_GET_CONST(float, op->GetAttr("min")); // seed does not work @@ -172,9 +175,21 @@ Node *squeeze_handler(Graph *graph, Node *node) { Node *cast_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto otype = BOOST_GET_CONST(int, op->GetAttr("out_dtype")); - auto new_node_cast = - CreateCast(graph, node, node->inputs, node->outputs, otype); - return new_node_cast; + auto new_node = CreateCast(graph, node, node->inputs, node->outputs, otype); + // Cast op created in mixed-precision has no pipeline attrs + auto &prev_nodes = node->inputs.front()->inputs; + if (!prev_nodes.empty()) { + auto *prev_op = prev_nodes.front()->Op(); + if (!new_node->Op()->HasAttr(sIpuIndexAttr) && + prev_op->HasAttr(sIpuIndexAttr)) { + CopyOpAttr(sIpuIndexAttr, prev_op, new_node->Op()); + } + if (!new_node->Op()->HasAttr(sIpuStageAttr) && + prev_op->HasAttr(sIpuStageAttr)) { + CopyOpAttr(sIpuStageAttr, prev_op, new_node->Op()); + } + } + return new_node; } Node *lookup_table_op_handler(Graph *graph, Node *node, @@ -192,7 +207,7 @@ Node *lookup_table_op_handler(Graph *graph, Node *node, auto concat_const = CreateConst(graph, node, {}, {}, {{"value", const_value_}, {"dims", const_shape_}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto axes = CreateConst(graph, node, {}, {}, {{"value", std::vector{0}}, {"dims", std::vector{1}}, @@ -397,7 +412,7 @@ Node *expand_handler(Graph *graph, Node *node) { // cast to int64 expand_times = CreateCast(graph, node, {GetInputVarNode("ExpandTimes", node)}, {}, - framework::proto::VarType::INT64); + VarType::INT64); } else { auto expand_times_i32 = BOOST_GET_CONST(std::vector, op->GetAttr("expand_times")); @@ -423,27 +438,28 @@ Node *assign_handler(Graph *graph, Node *node) { Node *assign_value_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto dims_ = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); std::vector dims(dims_.begin(), dims_.end()); Attribute values; std::string value_name; switch (dtype_) { - case framework::proto::VarType::BOOL: { + case VarType::BOOL: { value_name = "bool_values"; auto vec_int = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); std::vector vec_bool(vec_int.begin(), vec_int.end()); values = vec_bool; } break; - case framework::proto::VarType::INT32: + case VarType::INT32: value_name = "int32_values"; values = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); break; - case framework::proto::VarType::FP32: + case VarType::FP16: + case VarType::FP32: value_name = "fp32_values"; values = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); break; - case framework::proto::VarType::INT64: + case VarType::INT64: value_name = "int64_values"; values = BOOST_GET_CONST(std::vector, op->GetAttr(value_name)); break; @@ -463,39 +479,40 @@ Node *fill_any_like_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto value = BOOST_GET_CONST(float,
op->GetAttr("value")); auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); - auto dtype = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto x_dtype = static_cast(dtype); + auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); + auto dtype = static_cast(dtype_); size_t size = 1; for (auto &dim : x_shape) { size *= dim; } Attribute out_value; - switch (x_dtype) { - case framework::proto::VarType::FP32: + switch (dtype) { + case VarType::FP16: + case VarType::FP32: out_value = std::vector(size, value); break; - case framework::proto::VarType::FP64: + case VarType::FP64: out_value = std::vector(size, value); break; - case framework::proto::VarType::INT32: + case VarType::INT32: out_value = std::vector(size, value); break; - case framework::proto::VarType::INT64: + case VarType::INT64: out_value = std::vector(size, value); break; - case framework::proto::VarType::BOOL: + case VarType::BOOL: out_value = std::vector(size, value); break; default: PADDLE_THROW( - platform::errors::Unimplemented("fill_any_like dtype: %d", x_dtype)); + platform::errors::Unimplemented("fill_any_like dtype: %d", dtype)); } return CreateConst(graph, node, node->inputs, node->outputs, AttributeMap{ {"value", out_value}, {"dims", x_shape}, - {"dtype", VarType2OnnxDtype(dtype)}, + {"dtype", VarType2OnnxDType(dtype)}, }); } @@ -538,8 +555,7 @@ Node *one_hot_v2_handler(Graph *graph, Node *node) { {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT32}}); Node *value_tensor = nullptr; - if (GetOutputVarNode("Out", node)->Var()->GetDataType() == - framework::proto::VarType::FP16) { + if (GetOutputVarNode("Out", node)->Var()->GetDataType() == VarType::FP16) { value_tensor = CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, {"dims", std::vector{2}}, diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 6a58f7890f9fa..2e960c1c0dd9c 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -54,7 +54,10 @@ std::vector GetXPUSelectedDevices() { void MemcpySyncH2D(void* dst, const void* src, size_t count, const platform::XPUPlace& dst_place) { - phi::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(dst_place); + dev_ctx->Wait(); + phi::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place, *dev_ctx); } void MemcpySyncD2H(void* dst, const void* src, size_t count, diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index 99a1eb97de50a..43c9e63ac194b 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -42,6 +42,8 @@ XPUOpMap& get_kp_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"elementwise_pow", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, // activation op {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -105,6 +107,8 @@ XPUOpMap& get_kp_ops() { {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"reduce_amax", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index ea7c502e3e681..998437997547b 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -24,13 +24,14 @@ namespace dynload { CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 -CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - #ifdef CUSPARSE_ROUTINE_EACH_11020 CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP); #endif + +#ifdef CUSPARSE_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c7a6bdc3cefae..772a7750fe90d 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -106,9 +106,6 @@ namespace phi { class ErrorSummary; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_int64(gpu_allocator_retry_time); -#endif DECLARE_int32(call_stack_level); namespace paddle { @@ -539,7 +536,7 @@ inline void retry_sleep(unsigned milliseconds) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ + paddle::platform::retry_sleep(10000); \ __cond__ = (COND); \ ++retry_count; \ } \ @@ -727,7 +724,7 @@ inline void retry_sleep(unsigned millisecond) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ + ::paddle::platform::retry_sleep(10000); \ __cond__ = (COND); \ ++retry_count; \ } \ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index ab8bf0529dcfc..6636fc8aca51d 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -258,13 +258,13 @@ void BindDistributed(py::module *m) { #else const platform::CUDAPlace &, #endif - int, int, int, int, int, bool, std::string>(), + int, int, int, int, int, bool, std::string, int, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0, py::arg("local_size") = 1, py::arg("gloo_rank") = 0, py::arg("gloo_size") = 1, py::arg("with_switch") = false, - py::arg("switch_endpoint") = "", - py::call_guard()); + py::arg("switch_endpoint") = "", py::arg("src_rank") = "", + py::arg("dst_rank") = "", py::call_guard()); #endif #if defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 4d7b50943d084..ac33eb2359c8c 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -119,8 +119,7 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); egr::Backward(tensors, grad_tensors, CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -159,8 +158,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, 
egr::EagerUtils::autograd_meta(&(src))->StopGradient()); egr::EagerUtils::autograd_meta(&dst)->SetPersistable( egr::EagerUtils::autograd_meta(&(src))->Persistable()); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -406,12 +404,9 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, if (slot_map[0].find(i) != slot_map[0].end()) { grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); - grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); } else { grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); - grad_node->AddEdges(&ins_auto_grad_metas[i], - ins_auto_grad_metas.size() - 1 - no_grad_cnt); no_grad_cnt++; } } @@ -458,8 +453,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } grad_node->SetAttrs(attrs); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -691,8 +685,7 @@ static PyObject* eager_api_async_read(PyObject* self, PyObject* args, cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data(), index_tensor.numel() * size * sizeof(float), cudaMemcpyHostToDevice, stream); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -774,8 +767,7 @@ static PyObject* eager_api_async_write(PyObject* self, PyObject* args, cudaMemcpyDeviceToHost, stream); src_offset += c; } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e6bd1c0b52682..d3393b7cb57ac 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -267,8 +267,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, } else { PADDLE_THROW(platform::errors::InvalidArgument( "Tensor.numpy() only support cpu tensor.")); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } return array; @@ -335,8 +334,7 @@ static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self, } else { PADDLE_THROW(platform::errors::InvalidArgument( "StringTensor.numpy() only support cpu tensor.")); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -405,8 +403,8 @@ static PyObject* tensor_method_reconstruct_from_(TensorObject* self, VLOG(6) << "Finished Reconstructing Tensor from" << src_tensor.name() << " to " << self->tensor.name(); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -436,8 +434,8 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -453,8 +451,8 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, } egr::egr_utils_api::RetainGradForTensor(self->tensor); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -505,8 +503,8 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, } } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -535,8 +533,8 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, } } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -559,8 +557,8 @@ static PyObject* 
tensor__share_buffer_to(TensorObject* self, PyObject* args, static_cast(dst_ptr->impl().get()); dst_tensor->ShareBufferWith(*src_tensor); dst_tensor->ShareDataTypeWith(*src_tensor); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -600,8 +598,8 @@ static PyObject* tensor__share_underline_tensor_to(TensorObject* self, "src tensor before share_buffer_with to other.", self->tensor.name())); src_ptr->set_impl(self->tensor.impl()); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -656,8 +654,7 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (!self->tensor.defined()) { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } if (self->tensor.is_dense_tensor()) { auto* tensor = @@ -665,8 +662,7 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, VLOG(6) << "tensor: " << tensor->IsInitialized(); return ToPyObject(tensor); } else { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -676,16 +672,14 @@ static PyObject* tensor_method_get_underline_selected_rows(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (!self->tensor.defined()) { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } if (self->tensor.is_selected_rows()) { auto* selected_rows = static_cast(self->tensor.impl().get()); return ToPyObject(selected_rows); } else { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1110,8 +1104,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, false); } } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1202,8 +1196,8 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, accumulation_grad_node->RegisterReduceHook( std::make_shared(hook_func)); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1218,7 +1212,8 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args, } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { grad_tensor->set_impl(std::make_shared()); } - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1226,7 +1221,8 @@ static PyObject* tensor__clear(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY self->tensor.reset(); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1254,8 +1250,8 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, PyObject* args, "Tensor %s has not been initialized", src.name())); p_grad->set_impl(src.impl()); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, @@ -1396,7 +1392,7 @@ static PyObject* tensor__bump_inplace_version(TensorObject* self, PyObject* kwargs) { EAGER_TRY self->tensor.bump_inplace_version(); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1446,8 +1442,8 @@ static PyObject* tensor__reset_grad_inplace_version(TensorObject* self, grad->initialized()) { grad->reset_inplace_version(set_to_zero); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1479,8 +1475,8 @@ static PyObject* tensor_method__share_memory(TensorObject* self, PyObject* args, #else PADDLE_THROW(platform::errors::PermissionDenied( "Sharing memory in 
Windows OS is not supported currently")); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + #endif EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1522,8 +1518,7 @@ static PyObject* tensor__grad_value(TensorObject* self, PyObject* args, "cleared the grad inside autograd_meta")); if (!grad->defined()) { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } if (grad->is_dense_tensor()) { auto* grad_tensor = @@ -1532,8 +1527,7 @@ static PyObject* tensor__grad_value(TensorObject* self, PyObject* args, } else { PADDLE_THROW(paddle::platform::errors::Fatal( "this method is only supported for DenseTensor")); - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1556,8 +1550,8 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, static_cast(self->tensor.impl().get()); tensor_uva(self_tensor, device_id); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } #endif diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 2ac12165c1a66..b546aa2d76bcd 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -36,6 +36,11 @@ // phi #include "paddle/phi/kernels/declarations.h" +static std::string LegalizeVarName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '-' to '_' + return ret; +} // clang-format off const char* OUT_INITIALIZER_TEMPLATE = R"({"%s", {std::shared_ptr(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; @@ -185,18 +190,19 @@ std::string GenerateOpFunctionsBody( continue; } const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE; - auto input_arg = - paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name)); + auto input_arg = paddle::string::Sprintf( + ARG_TEMPLATE, in_type, TempName(LegalizeVarName(in_name))); input_args += input_arg; input_args += ","; input_args_num++; const auto in_cast_type = input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; auto dispensable = input.dispensable() ? "true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, - in_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(in_name), op_type, + in_name, arg_idx++, dispensable); - call_api_str += in_name + ", "; + call_api_str += LegalizeVarName(in_name) + ", "; } if (!input_args.empty() && input_args.back() == ',') { @@ -224,7 +230,7 @@ std::string GenerateOpFunctionsBody( input_args += ","; } input_args += out_type; - input_args += out_name; + input_args += LegalizeVarName(out_name); input_args_num++; if (output.dispensable()) { @@ -237,18 +243,19 @@ std::string GenerateOpFunctionsBody( const auto out_template = output.duplicable() ? INPUT_LIST_INITIALIZER_TEMPLATE : INPUT_INITIALIZER_TEMPLATE; - outs_initializer += - paddle::string::Sprintf(out_template, out_name, out_name); + outs_initializer += paddle::string::Sprintf(out_template, out_name, + LegalizeVarName(out_name)); outs_initializer += ","; } const auto in_cast_type = output.duplicable() ? CAST_VAR_PTR_LIST_TEMPLATE : CAST_VAR_PTR_TEMPLATE; auto dispensable = output.dispensable() ? 
"true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, - out_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(out_name), + op_type, out_name, arg_idx++, dispensable); - call_api_str += out_name + ", "; + call_api_str += LegalizeVarName(out_name) + ", "; } else { // There are few Operators that have duplicable output, like `Out` in // split op. We need to specify the number of variables for the @@ -257,7 +264,8 @@ std::string GenerateOpFunctionsBody( if (input_args != "") { input_args += ","; } - auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name); + auto out_num_str = + paddle::string::Sprintf(ARG_OUT_NUM, LegalizeVarName(out_name)); input_args += ARG_OUT_NUM_TYPE; input_args += out_num_str; input_args_num++; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index de66308a7baf6..7af221b9ac82e 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -52,8 +52,7 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { } else if (self->tensor.is_selected_rows()) { return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS); } else { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -87,8 +86,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index 605056e7af2b5..47a5309d691f5 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -346,10 +346,8 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, for (auto t : inputs_tensor[i]) { grad_node->SetGradOutMeta(*t, i); } - grad_node->AddEdges(&inputs_autograd_meta[i], i); } else { grad_node->SetGradOutMeta(*inputs_tensor[i][0], i); - grad_node->AddEdges(inputs_autograd_meta[i][0], i); } } @@ -392,8 +390,7 @@ PyObject* pylayer_method_register_hook(PyObject* _self, PyObject* hook) { PyObject* tensor_properties_get_container(PyLayerObject* self, void* closure) { EAGER_TRY if (self->container == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE; } Py_INCREF(self->container); return self->container; @@ -414,8 +411,7 @@ PyObject* tensor_properties_get_non_differentiable(PyLayerObject* self, void* closure) { EAGER_TRY if (self->non_differentiable == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE; } Py_INCREF(self->non_differentiable); return self->non_differentiable; @@ -436,8 +432,7 @@ PyObject* tensor_properties_get_dirty_tensors(PyLayerObject* self, void* closure) { EAGER_TRY if (self->dirty_tensors == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE; } Py_INCREF(self->dirty_tensors); return self->dirty_tensors; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index d07cbd5ee21a2..90d7024f7a746 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -516,8 +516,7 @@ PyObject* ToPyObject(const std::string& value) { PyObject* ToPyObject(const paddle::experimental::Tensor& value, bool return_py_none_if_not_initialize) { if (return_py_none_if_not_initialize && !value.initialized()) { - Py_INCREF(Py_None); - return 
Py_None; + RETURN_PY_NONE } PyObject* obj = nullptr; if (value.initialized() && value.is_string_tensor()) { @@ -679,8 +678,7 @@ PyObject* ToPyObject(const phi::SelectedRows* value) { PyObject* ToPyObject(const void* value) { if (value == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } PADDLE_THROW( platform::errors::Fatal("ToPyObject do not support void* with value.")); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index c4ddb34763228..5273433208d11 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -31,6 +31,10 @@ class Scope; } namespace pybind { +#define RETURN_PY_NONE \ + Py_INCREF(Py_None); \ + return Py_None; + int TensorDtype2NumpyDtype(phi::DataType dtype); bool IsEagerTensor(PyObject* obj); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 4a1dadd6d251c..bcf55e46edb76 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -60,11 +57,7 @@ void BindDistFleetWrapper(py::module* m) { .def("load_model", &FleetWrapper::LoadModel) .def("load_one_table", &FleetWrapper::LoadModelOneTable) .def("init_server", &FleetWrapper::InitServer) - .def("run_server", - (uint64_t (FleetWrapper::*)(void)) & FleetWrapper::RunServer) - .def("run_server", (uint64_t (FleetWrapper::*)( // NOLINT - const std::string&, uint32_t)) & // NOLINT - FleetWrapper::RunServer) + .def("run_server", &FleetWrapper::RunServer) .def("init_worker", &FleetWrapper::InitWorker) .def("push_dense_params", &FleetWrapper::PushDenseParamSync) .def("pull_dense_params", &FleetWrapper::PullDenseVarsSync) @@ -259,6 +252,8 @@ using paddle::distributed::IndexNode; #ifdef PADDLE_WITH_HETERPS using paddle::framework::GraphGpuWrapper; using paddle::framework::NeighborSampleResult; +using paddle::framework::NeighborSampleQuery; +using paddle::framework::NodeQueryResult; #endif void BindIndexNode(py::module* m) { @@ -311,24 +306,55 @@ void BindIndexWrapper(py::module* m) { } #ifdef PADDLE_WITH_HETERPS +void BindNodeQueryResult(py::module* m) { + py::class_(*m, "NodeQueryResult") + .def(py::init<>()) + .def("initialize", &NodeQueryResult::initialize) + .def("display", &NodeQueryResult::display) + .def("get_val", &NodeQueryResult::get_val) + .def("get_len", &NodeQueryResult::get_len); +} +void BindNeighborSampleQuery(py::module* m) { + py::class_(*m, "NeighborSampleQuery") + .def(py::init<>()) + .def("initialize", &NeighborSampleQuery::initialize) + .def("display", &NeighborSampleQuery::display); +} + void BindNeighborSampleResult(py::module* m) { py::class_(*m, "NeighborSampleResult") .def(py::init<>()) - .def("initialize", &NeighborSampleResult::initialize); + .def("initialize", &NeighborSampleResult::initialize) + .def("get_len", &NeighborSampleResult::get_len) + .def("get_val", &NeighborSampleResult::get_actual_val) + .def("display", &NeighborSampleResult::display); } void BindGraphGpuWrapper(py::module* m) { py::class_(*m, "GraphGpuWrapper") + // nit<>()) + //.def("test", 
&GraphGpuWrapper::test) + //.def(py::init([]() { return framework::GraphGpuWrapper::GetInstance(); + //})) .def(py::init<>()) - .def("test", &GraphGpuWrapper::test) - .def("initialize", &GraphGpuWrapper::initialize) + .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) .def("set_device", &GraphGpuWrapper::set_device) .def("init_service", &GraphGpuWrapper::init_service) .def("set_up_types", &GraphGpuWrapper::set_up_types) + .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) .def("load_edge_file", &GraphGpuWrapper::load_edge_file) .def("upload_batch", &GraphGpuWrapper::upload_batch) + .def("get_all_id", &GraphGpuWrapper::get_all_id) + .def("load_next_partition", &GraphGpuWrapper::load_next_partition) + .def("make_partitions", &GraphGpuWrapper::make_partitions) + .def("make_complementary_graph", + &GraphGpuWrapper::make_complementary_graph) + .def("set_search_level", &GraphGpuWrapper::set_search_level) + .def("init_search_level", &GraphGpuWrapper::init_search_level) + .def("get_partition_num", &GraphGpuWrapper::get_partition_num) + .def("get_partition", &GraphGpuWrapper::get_partition) .def("load_node_file", &GraphGpuWrapper::load_node_file); } #endif diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 81ed25913ba1a..a47aec749bda5 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -39,6 +39,8 @@ void BindIndexSampler(py::module* m); #ifdef PADDLE_WITH_HETERPS void BindNeighborSampleResult(py::module* m); void BindGraphGpuWrapper(py::module* m); +void BindNodeQueryResult(py::module* m); +void BindNeighborSampleQuery(py::module* m); #endif } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 145c116fa14c3..1da0831fc6323 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2028,35 +2028,35 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) - .def( - "_get_kernel_signature", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs) { - // TODO(xiongkun): move this function outside of tracer. - auto ins_map = ConvertToNameTensorMap(ins); - auto outs_map = ConvertToNameTensorMap(outs); - { - auto input_to_vector = - [](paddle::SmallVector &vec) { - return std::vector(vec.begin(), vec.end()); - }; - auto output_to_vector = - [](paddle::SmallVector &vec) { - return std::vector(vec.begin(), vec.end()); - }; - auto attr_to_vector = [](paddle::SmallVector &vec) { - return std::vector(vec.begin(), vec.end()); - }; - auto ret = self.GetExpectedKernelSignature(type, ins_map, - outs_map, attrs); - auto kernelsig_ins = input_to_vector(ret.input_names); - auto kernelsig_attrs = attr_to_vector(ret.attr_names); - auto kernelsig_outs = output_to_vector(ret.output_names); - return std::make_tuple(kernelsig_ins, kernelsig_attrs, - kernelsig_outs); - } - }) + .def("_get_kernel_signature", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs) { + // TODO(xiongkun): move this function outside of tracer. 
+ auto ins_map = ConvertToNameTensorMap(ins); + auto outs_map = ConvertToNameTensorMap(outs); + { + auto input_to_vector = + [](paddle::small_vector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto output_to_vector = + [](paddle::small_vector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto attr_to_vector = + [](paddle::small_vector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto ret = self.GetExpectedKernelSignature(type, ins_map, + outs_map, attrs); + auto kernelsig_ins = input_to_vector(ret.input_names); + auto kernelsig_attrs = attr_to_vector(ret.attr_names); + auto kernelsig_outs = output_to_vector(ret.output_names); + return std::make_tuple(kernelsig_ins, kernelsig_attrs, + kernelsig_outs); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 91d5d39622714..1bbe6808b2846 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -765,10 +765,7 @@ void BindMkldnnQuantizerConfig(py::module *m) { return; }) .def("set_quant_batch_size", &MkldnnQuantizerConfig::SetWarmupBatchSize) - .def( - "set_enabled_op_types", - (void (MkldnnQuantizerConfig::*)(std::unordered_set &)) & - MkldnnQuantizerConfig::SetEnabledOpTypes); + .def("set_enabled_op_types", &MkldnnQuantizerConfig::SetEnabledOpTypes); } #endif diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 7b9379df6be2c..5a5650e75665c 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -177,7 +177,7 @@ static inline void HandleViewBetweenInputAndOutput( } } -PyObject* MakeReturnPyObject( +static inline PyObject* MakeReturnPyObject( const std::shared_ptr& out) { return ::pybind11::detail::type_caster_base::cast_holder( ::pybind11::detail::holder_helper< @@ -186,7 +186,7 @@ PyObject* MakeReturnPyObject( .ptr(); } -PyObject* MakeReturnPyObject( +static inline PyObject* MakeReturnPyObject( const std::vector>& out) { PyObject* result = PyList_New((Py_ssize_t)out.size()); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 5eed63d0800b3..0e9c08cff2859 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -282,6 +282,7 @@ std::vector CastPyArg2Ints(PyObject* obj, const std::string& op_type, std::vector value; if (PyList_Check(obj)) { Py_ssize_t len = PyList_Size(obj); + value.reserve(len); PyObject* item = nullptr; for (Py_ssize_t i = 0; i < len; i++) { item = PyList_GetItem(obj, i); @@ -298,6 +299,7 @@ std::vector CastPyArg2Ints(PyObject* obj, const std::string& op_type, } } else if (PyTuple_Check(obj)) { Py_ssize_t len = PyTuple_Size(obj); + value.reserve(len); PyObject* item = nullptr; for (Py_ssize_t i = 0; i < len; i++) { item = PyTuple_GetItem(obj, i); @@ -314,6 +316,7 @@ std::vector CastPyArg2Ints(PyObject* obj, const std::string& op_type, } } else if (PySequence_Check(obj)) { Py_ssize_t len = PySequence_Size(obj); + value.reserve(len); PyObject* item = nullptr; for (Py_ssize_t i = 0; i < len; i++) { item = PySequence_GetItem(obj, i); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 9d5bcfac494cb..a905c5befc2b0 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -35,6 +35,12 @@ // phi 
#include "paddle/phi/kernels/declarations.h" +static std::string LegalizeVarName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '@' with '_' + return ret; +} + // NOTE(pangyoki): Inplace OP with duplicable input. // The set includes inplace ops that have duplicable input. // The first VarBase in input needs to be specified for the inplace strategy @@ -81,13 +87,13 @@ const char* OUT_VAR_TYPE = R"(std::shared_ptr)"; const char* OUT_VAR_LIST_TYPE = R"(std::vector>)"; const char* CAST_VAR_TEMPLATE = R"( - auto %s = GetVarBaseFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetVarBaseFromArgs(op_type, "%s", args, %d, %s);)"; const char* CAST_VAR_LIST_TEMPLATE = R"( - auto %s = GetVarBaseListFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetVarBaseListFromArgs(op_type, "%s", args, %d, %s);)"; const char* CAST_SIZE_T_TEMPLATE = R"( - auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)"; + auto %s = GetUnsignedLongFromArgs(op_type, "%s", args, %d, %s);)"; const char* ARG_TEMPLATE = R"(const %s& %s)"; @@ -126,16 +132,17 @@ static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) PyThreadState *tstate = nullptr; try { + std::string op_type = "%s"; platform::RecordEvent op_type_record_event("%s pybind_imperative_func"); %s framework::AttributeMap attrs; - ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs); + ConstructAttrMapFromPyArgs(op_type, args, %d, PyTuple_GET_SIZE(args) , attrs); tstate = PyEval_SaveThread(); %s imperative::NameVarBaseMap outs = %s; imperative::NameVarBaseMap ins = %s; %s - imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s}); + imperative::GetCurrentTracer()->TraceOp(op_type, ins, outs, attrs, {%s}); PyEval_RestoreThread(tstate); tstate = nullptr; %s @@ -200,28 +207,31 @@ std::string GenerateOpFunctionsBody( continue; } const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE; - auto input_arg = - paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name)); + auto input_arg = paddle::string::Sprintf( + ARG_TEMPLATE, in_type, LegalizeVarName(TempName(in_name))); input_args += input_arg; input_args += ","; input_args_num++; const auto in_cast_type = input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; auto dispensable = input.dispensable() ? "true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, - in_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(in_name), in_name, + arg_idx++, dispensable); if (input.dispensable()) { const auto in_template = input.duplicable() ? INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST : INPUT_INITIALIZER_TEMPLATE_WITH_NULL; ins_initializer_with_null += - paddle::string::Sprintf(in_template, in_name, in_name, in_name); + paddle::string::Sprintf(in_template, LegalizeVarName(in_name), + in_name, LegalizeVarName(in_name)); } else { const auto in_template = input.duplicable() ?
INPUT_LIST_INITIALIZER_TEMPLATE : INPUT_INITIALIZER_TEMPLATE; - ins_initializer += paddle::string::Sprintf(in_template, in_name, in_name); + ins_initializer += paddle::string::Sprintf(in_template, in_name, + LegalizeVarName(in_name)); ins_initializer += ","; } } @@ -258,7 +268,7 @@ std::string GenerateOpFunctionsBody( input_args += ","; } input_args += out_type; - input_args += out_name; + input_args += LegalizeVarName(out_name); input_args_num++; if (output.dispensable()) { @@ -271,16 +281,17 @@ std::string GenerateOpFunctionsBody( const auto out_template = output.duplicable() ? INPUT_LIST_INITIALIZER_TEMPLATE : INPUT_INITIALIZER_TEMPLATE; - outs_initializer += - paddle::string::Sprintf(out_template, out_name, out_name); + outs_initializer += paddle::string::Sprintf(out_template, out_name, + LegalizeVarName(out_name)); outs_initializer += ","; } const auto in_cast_type = output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; auto dispensable = output.dispensable() ? "true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, - out_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(out_name), + out_name, arg_idx++, dispensable); } else if (use_inplace_strategy && inplace_map.count(out_name)) { PADDLE_ENFORCE_NE( inplace_map[out_name], "", @@ -306,11 +317,13 @@ std::string GenerateOpFunctionsBody( // Leaf Var that doesn't stop gradient can't use inplace strategy. // Increase inplace_version. inplace_strategy_str += paddle::string::Sprintf( - INPLACE_STRATEGY_TEMPLATE, inplace_input_name, inplace_input_name, - INPLACE_LEAF_ERROR_MESSAGE, inplace_input_name, inplace_input_name, - inplace_input_name); - outs_initializer += - paddle::string::Sprintf(out_template, out_name, inplace_input_name); + INPLACE_STRATEGY_TEMPLATE, LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name), INPLACE_LEAF_ERROR_MESSAGE, + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name)); + outs_initializer += paddle::string::Sprintf( + out_template, out_name, LegalizeVarName(inplace_input_name)); outs_initializer += ","; } else { // There are few Operators that have duplicable output, like `Out` in @@ -320,7 +333,8 @@ std::string GenerateOpFunctionsBody( if (input_args != "") { input_args += ","; } - auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name); + auto out_num_str = + paddle::string::Sprintf(ARG_OUT_NUM, LegalizeVarName(out_name)); input_args += ARG_OUT_NUM_TYPE; input_args += out_num_str; input_args_num++; @@ -329,7 +343,7 @@ std::string GenerateOpFunctionsBody( auto dispensable = output.dispensable() ? 
"true" : "false"; ins_cast_str += - paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, op_type, + paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, out_num_str, arg_idx++, dispensable); } else { outs_initializer += @@ -358,7 +372,7 @@ std::string GenerateOpFunctionsBody( viwe_input_name, viwe_output_name); } if (outs_num == 0) { - return_str = "Py_INCREF(Py_None);\n return Py_None;"; + return_str = "RETURN_PY_NONE"; } else if (outs_num == 1) { return_str = "return MakeReturnPyObject(" + return_str + ");"; } else { @@ -375,11 +389,11 @@ std::string GenerateOpFunctionsBody( // generate op funtcion body auto op_function_str = paddle::string::Sprintf( - OP_FUNCTION_TEMPLATE, func_name, op_type, ins_cast_str, op_type, + OP_FUNCTION_TEMPLATE, func_name, op_type, op_type, ins_cast_str, input_args_num, inplace_strategy_str, outs_initializer, ins_initializer, ins_initializer_with_null + outs_initializer_with_null + view_strategy_str, - op_type, inplace_mapping_str, return_str); + inplace_mapping_str, return_str); return op_function_str; } diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 7b128bd3b0e4d..2b849968c76f9 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -32,6 +32,10 @@ std::map> op_ins_map = { {"fused_attention", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "SrcMask", "OutLinearW", "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + {"fused_multi_transformer", + {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "TimeStep", + "SrcMask", "OutLinearW", "OutLinearBias", "FFNLnScale", "FFNLnBias", + "FFN1Weight", "FFN1Bias", "FFN2Weight", "FFN2Bias"}}, {"instance_norm", {"X", "Scale", "Bias"}}, {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, {"label_smooth", {"X", "PriorDist"}}, @@ -176,6 +180,7 @@ std::map> op_outs_map = { {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"fused_multi_transformer", {"CacheKVOut", "Out"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -253,6 +258,7 @@ std::map> op_passing_outs_map = { {"assign_value", {"Out"}}, {"split", {"Out"}}, {"concat", {"Out"}}, + {"fused_multi_transformer", {"CacheKVOut"}}, }; // NOTE(pangyoki): Tensor View Strategy. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index dc380f83bf71b..602a0345b04fe 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1921,7 +1921,7 @@ All parameter, weight, gradient are variables in Paddle. Prune the backward part of a program, mostly called in program.clone(for_test=True). - Args: + Args: program (ProgramDesc): The original program. Returns: @@ -1930,6 +1930,17 @@ All parameter, weight, gradient are variables in Paddle. which contains the id pair of pruned block and corresponding origin block. )DOC"); + m.def("get_readable_comile_key", [](const OpDesc &op_desc) { + auto compilation_key = + BOOST_GET_CONST(std::string, op_desc.GetAttr("compilation_key")); + VLOG(4) << std::hash{}(compilation_key) << " " + << compilation_key.size(); + proto::ProgramDesc desc; + desc.ParseFromString(compilation_key); + auto s = desc.DebugString(); + VLOG(4) << s; + return s; + }); m.def("empty_var_name", []() { return std::string(framework::kEmptyVarName); }); m.def("grad_var_suffix", @@ -2195,6 +2206,7 @@ All parameter, weight, gradient are variables in Paddle. 
std::exit(-1); #endif }) + .def("_type", &PlaceIndex) .def("get_device_id", [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) .def("get_device_type", @@ -3010,6 +3022,10 @@ All parameter, weight, gradient are variables in Paddle. // Only GPUs with Compute Capability >= 53 support float16 return platform::GetGPUComputeCapability(place.device) >= 53; }); + m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 80 support bfloat16 + return platform::GetGPUComputeCapability(place.device) >= 80; + }); #endif m.def("set_feed_variable", @@ -4341,7 +4357,10 @@ All parameter, weight, gradient are variables in Paddle. for (auto element : opt) { auto option_name = element.first.cast(); VLOG(10) << "Set option: " << option_name; - if (py::isinstance(element.second)) { + if (option_name == "compilation_progress_logger") { + self.SetCompilationProgressLogger( + element.second.cast()); + } else if (py::isinstance(element.second)) { self.AddBoolOption(option_name, element.second.cast()); } else if (py::isinstance(element.second)) { self.AddDoubleOption(option_name, @@ -4567,6 +4586,8 @@ All parameter, weight, gradient are variables in Paddle. BindIndexWrapper(&m); BindIndexSampler(&m); #ifdef PADDLE_WITH_HETERPS + BindNodeQueryResult(&m); + BindNeighborSampleQuery(&m); BindNeighborSampleResult(&m); BindGraphGpuWrapper(&m); #endif diff --git a/paddle/fluid/pybind/pybind_boost_headers.h b/paddle/fluid/pybind/pybind_boost_headers.h index 3eb4db175a745..be9333eb7361b 100644 --- a/paddle/fluid/pybind/pybind_boost_headers.h +++ b/paddle/fluid/pybind/pybind_boost_headers.h @@ -45,10 +45,28 @@ struct PYBIND11_HIDDEN paddle_variant_caster_visitor paddle_variant_caster_visitor(return_value_policy policy, handle parent) : policy(policy), parent(parent) {} - template - handle operator()(T const &src) const { + template ::value, + bool>::type* = nullptr> + handle operator()(T const& src) const { return make_caster::cast(src, policy, parent); } + + template ::value, + bool>::type* = nullptr> + handle operator()(T const& src) const { + try { + return make_caster::cast(src, policy, parent); + } catch (std::exception& ex) { + VLOG(4) << ex.what(); + VLOG(4) << src; + // UnicodeDecodeError, src is not utf-8 encoded + // see details: + // https://github.com/pybind/pybind11/blob/master/docs/advanced/cast/strings.rst + return PYBIND11_BYTES_FROM_STRING_AND_SIZE(src.data(), src.size()); + } + } }; template @@ -105,7 +123,7 @@ struct paddle_variant_caster> { return load_success_; } - static handle cast(Type const &src, return_value_policy policy, + static handle cast(Type const& src, return_value_policy policy, handle parent) { paddle_variant_caster_visitor visitor(policy, parent); return boost::apply_visitor(visitor, src); diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index b1aa81260968f..9425a290142da 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -93,9 +93,9 @@ std::vector GetCandidateKernels( phi_kernel_desc.input_types.clear(); phi_kernel_desc.output_types.clear(); phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); - const paddle::SmallVector& + const paddle::small_vector& input_arg = args_def.input_defs(); - const paddle::SmallVector& + const paddle::small_vector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { 
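// Sketch of the string-visitor change above, reduced to its core idea: converting a
// std::string that is not valid UTF-8 into a Python str raises UnicodeDecodeError, so
// the new overload falls back to handing the raw bytes back. Simplified pybind11 form
// (the real code does this inside paddle_variant_caster_visitor via enable_if overloads):
#include <pybind11/pybind11.h>
#include <string>
namespace py = pybind11;

py::object ToStrOrBytes(const std::string& s) {
  try {
    return py::cast(s);                      // normal path: UTF-8 -> Python str
  } catch (const std::exception&) {
    return py::bytes(s.data(), s.size());    // fallback: undecoded raw bytes
  }
}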
phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg)); diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 070867853ad3e..49fe069217ed7 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -63,6 +63,12 @@ bool ProtoArgumentMappingContext::IsDenseTensorInput( const std::string& name) const { return true; } + +bool ProtoArgumentMappingContext::IsDenseTensorInputs( + const std::string& name) const { + return true; +} + bool ProtoArgumentMappingContext::IsSelectedRowsInput( const std::string& name) const { return false; diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index 5cf2ef979076d..7cb2651ccf6a2 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -41,6 +41,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { size_t OutputSize(const std::string& name) const override; bool IsDenseTensorInput(const std::string& name) const override; + bool IsDenseTensorInputs(const std::string& name) const override; bool IsSelectedRowsInput(const std::string& name) const override; bool IsDenseTensorVectorInput(const std::string& name) const override; diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index ae248a7bf1280..38a60ab978900 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -69,7 +69,12 @@ std::tuple adam_impl( kernel_data_type = kernel_key.dtype(); } } + std::string kernel_name = "adam"; + if (!phi::DenseTensor::classof(grad.impl().get())) { + kernel_name = "adam_dense_param_sparse_grad"; + } + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, {kernel_backend, kernel_layout, kernel_data_type}); VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", " @@ -77,9 +82,7 @@ std::tuple adam_impl( VLOG(6) << kernel_name << " API kernel: " << kernel; auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - auto input_param = PrepareData(param, kernel.InputAt(0), {}); - auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); auto input_lr = PrepareData(learning_rate, kernel.InputAt(2), {}); auto input_moment1 = PrepareData(moment1, kernel.InputAt(3), {}); auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); @@ -140,78 +143,155 @@ std::tuple adam_impl( phi::MetaTensor meta_out_4(kernel_out_4); phi::MetaTensor meta_out_5(kernel_out_5); - phi::AdamInferMeta(MakeMetaTensor(*input_param), - MakeMetaTensor(*input_grad), - MakeMetaTensor(*input_lr), - MakeMetaTensor(*input_moment1), - MakeMetaTensor(*input_moment2), - MakeMetaTensor(*input_beta1_pow), - MakeMetaTensor(*input_beta2_pow), - input_meta_ref_master_param, - input_meta_ref_skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - &meta_out_0, - &meta_out_1, - &meta_out_2, - &meta_out_3, - &meta_out_4, - &meta_out_5); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - const phi::DenseTensor&, - const phi::DenseTensor&, - const phi::DenseTensor&, - const phi::DenseTensor&, - const phi::DenseTensor&, - const phi::DenseTensor&, - paddle::optional, - paddle::optional, - const Scalar&, - const Scalar&, - const Scalar&, - 
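// Sketch of the dispatch rule introduced in adam_impl above: the kernel name is chosen
// from the concrete type behind the grad tensor, so a SelectedRows gradient routes to
// the sparse-update kernel. Reduced to a hypothetical helper; the Tensor type and the
// classof/impl calls are the ones used in the hunk:
std::string PickAdamKernelName(const paddle::experimental::Tensor& grad) {
  if (phi::DenseTensor::classof(grad.impl().get())) {
    return "adam";                            // dense gradient
  }
  return "adam_dense_param_sparse_grad";      // SelectedRows gradient
}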
bool, - int64_t, - bool, - bool, - phi::DenseTensor*, - phi::DenseTensor*, - phi::DenseTensor*, - phi::DenseTensor*, - phi::DenseTensor*, - phi::DenseTensor*); - auto* kernel_fn = kernel.GetVariadicKernelFn(); + if (phi::DenseTensor::classof(grad.impl().get())) { + auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); + + phi::AdamInferMeta(MakeMetaTensor(*input_param), + MakeMetaTensor(*input_grad), + MakeMetaTensor(*input_lr), + MakeMetaTensor(*input_moment1), + MakeMetaTensor(*input_moment2), + MakeMetaTensor(*input_beta1_pow), + MakeMetaTensor(*input_beta2_pow), + input_meta_ref_master_param, + input_meta_ref_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + &meta_out_0, + &meta_out_1, + &meta_out_2, + &meta_out_3, + &meta_out_4, + &meta_out_5); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + paddle::optional, + const Scalar&, + const Scalar&, + const Scalar&, + bool, + int64_t, + bool, + bool, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, - *input_param, - *input_grad, - *input_lr, - *input_moment1, - *input_moment2, - *input_beta1_pow, - *input_beta2_pow, - input_master_param, - input_skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - kernel_out_0, - kernel_out_1, - kernel_out_2, - kernel_out_3, - kernel_out_4, - kernel_out_5); + (*kernel_fn)(*dev_ctx, + *input_param, + *input_grad, + *input_lr, + *input_moment1, + *input_moment2, + *input_beta1_pow, + *input_beta2_pow, + input_master_param, + input_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + kernel_out_0, + kernel_out_1, + kernel_out_2, + kernel_out_3, + kernel_out_4, + kernel_out_5); + } else { + auto input_grad = TensorToSelectedRows(grad); + + phi::AdamInferMeta(MakeMetaTensor(*input_param), + MakeMetaTensor(*input_grad), + MakeMetaTensor(*input_lr), + MakeMetaTensor(*input_moment1), + MakeMetaTensor(*input_moment2), + MakeMetaTensor(*input_beta1_pow), + MakeMetaTensor(*input_beta2_pow), + input_meta_ref_master_param, + input_meta_ref_skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + &meta_out_0, + &meta_out_1, + &meta_out_2, + &meta_out_3, + &meta_out_4, + &meta_out_5); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + paddle::optional, + const Scalar&, + const Scalar&, + const Scalar&, + bool, + int64_t, + bool, + bool, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_param, + *input_grad, + *input_lr, + *input_moment1, + *input_moment2, + *input_beta1_pow, + *input_beta2_pow, + input_master_param, + input_skip_update, 
+ beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + kernel_out_0, + kernel_out_1, + kernel_out_2, + kernel_out_3, + kernel_out_4, + kernel_out_5); + } return api_output; } diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index a0fd42d769aac..fb205212ff371 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -154,7 +154,7 @@ phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) { std::make_shared(phi::DenseTensor(), phi::DenseTensor(), phi::DenseTensor(), - phi::DDim{-1}); + phi::DDim{-1, -1}); out->set_impl(sparse_tensor); return sparse_tensor.get(); } else { diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index be545ac9ce2f7..9f2ad6c62c7cf 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -96,8 +96,7 @@ struct KernelKeyParser : ArgsIterator { // TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): add global device guard method to set backend - void operator()(const Tensor& x) { - const phi::TensorBase& tensor = *x.impl(); + inline void AssignKernelKeySet(const phi::TensorBase& tensor) { key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(tensor); // TODO(chenweihang): select multi layout and dtype @@ -110,6 +109,8 @@ struct KernelKeyParser : ArgsIterator { } } + void operator()(const Tensor& x) { AssignKernelKeySet(*x.impl()); } + void operator()(const std::vector& x) { const phi::TensorBase& tensor = *x.at(0).impl(); key_set.backend_set = @@ -119,6 +120,13 @@ struct KernelKeyParser : ArgsIterator { key_set.dtype = tensor.dtype(); } + void operator()(const paddle::optional x) { + if (x.get_ptr() != nullptr) { + const phi::TensorBase& tensor = *(x.get_ptr()->impl()); + AssignKernelKeySet(tensor); + } + } + // skip other type args, these args don't used in kernel selection template void operator()(const T& x) { diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc index a37fbf35a26e8..326645726bbed 100644 --- a/paddle/phi/backends/dynload/cusparse.cc +++ b/paddle/phi/backends/dynload/cusparse.cc @@ -26,12 +26,13 @@ void *cusparse_dso_handle; CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 -CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - #ifdef CUSPARSE_ROUTINE_EACH_11020 CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP); #endif + +#ifdef CUSPARSE_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index d454fc0734c66..4dba0ab94ff20 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -140,8 +140,10 @@ std::vector GetXPUSelectedDevices() { void MemcpySyncH2D(void* dst, const void* src, size_t count, - const phi::XPUPlace& dst_place) { + const phi::XPUPlace& dst_place, + const phi::XPUContext& dev_ctx) { XPUDeviceGuard guard(dst_place.device); + dev_ctx.Wait(); PADDLE_ENFORCE_XPU_SUCCESS( xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index fa7d1b5c18a7d..b1056cdc4b14b 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -49,7 +49,8 @@ std::vector GetXPUSelectedDevices(); void MemcpySyncH2D(void *dst, const void 
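// Sketch of the KernelKeyParser overload added above, with the optional's element type
// written out (assumed here to be const Tensor&): an optional input contributes to
// backend/layout/dtype selection only when it is actually present.
void operator()(const paddle::optional<const Tensor&> x) {
  if (x.get_ptr() != nullptr) {
    AssignKernelKeySet(*(x.get_ptr()->impl()));  // fold the present tensor into the key set
  }
  // absent input: the kernel key is left unchanged
}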
*src, size_t count, - const phi::XPUPlace &dst_place); + const phi::XPUPlace &dst_place, + const phi::XPUContext &dev_ctx); void MemcpySyncD2H(void *dst, const void *src, size_t count, diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index f807f268a2d33..f47e8d550e693 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -27,30 +27,30 @@ limitations under the License. */ namespace phi { // tuple(input_names, attr_names, output_names) -using KernelArgsTuple = std::tuple, - paddle::SmallVector, - paddle::SmallVector>; +using KernelArgsTuple = std::tuple, + paddle::small_vector, + paddle::small_vector>; struct KernelSignature { const char* name; - paddle::SmallVector input_names; - paddle::SmallVector attr_names; - paddle::SmallVector output_names; + paddle::small_vector input_names; + paddle::small_vector attr_names; + paddle::small_vector output_names; KernelSignature() = default; KernelSignature(const char* kernel_name, - paddle::SmallVector&& inputs, - paddle::SmallVector&& attrs, - paddle::SmallVector&& outputs) + paddle::small_vector&& inputs, + paddle::small_vector&& attrs, + paddle::small_vector&& outputs) : name(kernel_name), input_names(std::move(inputs)), attr_names(std::move(attrs)), output_names(std::move(outputs)) {} KernelSignature(const char* kernel_name, - const paddle::SmallVector& inputs, - const paddle::SmallVector& attrs, - const paddle::SmallVector& outputs) + const paddle::small_vector& inputs, + const paddle::small_vector& attrs, + const paddle::small_vector& outputs) : name(kernel_name), input_names(inputs), attr_names(attrs), @@ -63,6 +63,13 @@ struct KernelSignature { input_names(other.input_names), attr_names(other.attr_names), output_names(other.output_names) {} + + KernelSignature(KernelSignature&& other) noexcept + : name(other.name), + input_names(std::move(other.input_names)), + attr_names(std::move(other.attr_names)), + output_names(std::move(other.output_names)) {} + KernelSignature& operator=(const KernelSignature& other) { name = other.name; input_names = other.input_names; @@ -70,6 +77,14 @@ struct KernelSignature { output_names = other.output_names; return *this; } + + KernelSignature& operator=(KernelSignature&& other) noexcept { + name = other.name; + input_names = std::move(other.input_names); + attr_names = std::move(other.attr_names); + output_names = std::move(other.output_names); + return *this; + } }; std::ostream& operator<<(std::ostream& os, KernelSignature signature); @@ -91,6 +106,7 @@ class ArgumentMappingContext { virtual size_t OutputSize(const std::string& name) const = 0; virtual bool IsDenseTensorInput(const std::string& name) const = 0; + virtual bool IsDenseTensorInputs(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; // For compatibility with LoDTensorArray virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 4388bd1f751cf..18c39bfae1d18 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -28,27 +28,28 @@ namespace phi { Backend TransToPhiBackend(const phi::Place& place) { auto allocation_type = place.GetType(); - if (allocation_type == phi::AllocationType::CPU) { - return Backend::CPU; - } else if (allocation_type == phi::AllocationType::GPU) { - return Backend::GPU; - } else if (allocation_type == 
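// Sketch: the new noexcept move constructor/assignment let containers relocate
// KernelSignature values by stealing the three name vectors instead of deep-copying
// them. Illustrative use; the op and attribute names below are a made-up subset, the
// constructor form matches KernelSignature(const char*, {...}, {...}, {...}):
std::vector<phi::KernelSignature> cache;
phi::KernelSignature sig("scale", {"X"}, {"scale", "bias"}, {"Out"});
cache.push_back(std::move(sig));  // invokes the new move constructor, no copy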
phi::AllocationType::GPUPINNED) { - return Backend::GPU; - } else if (allocation_type == phi::AllocationType::XPU) { - return Backend::XPU; - } else if (allocation_type == phi::AllocationType::NPU) { - return Backend::NPU; - } else if (allocation_type == phi::AllocationType::IPU) { - return Backend::IPU; - } else if (allocation_type == phi::AllocationType::MLU) { - return Backend::MLU; - } else if (allocation_type == phi::AllocationType::CUSTOM) { - return static_cast( - static_cast(Backend::NUM_BACKENDS) + - GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "Unsupported transform %s to phi Backend.", place)); + switch (allocation_type) { + case phi::AllocationType::GPU: + return Backend::GPU; + case AllocationType::CPU: + return Backend::CPU; + case AllocationType::GPUPINNED: + return Backend::GPU; + case AllocationType::XPU: + return Backend::XPU; + case AllocationType::NPU: + return Backend::NPU; + case AllocationType::IPU: + return Backend::IPU; + case AllocationType::MLU: + return Backend::MLU; + case AllocationType::CUSTOM: + return static_cast( + static_cast(Backend::NUM_BACKENDS) + + GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported transform %s to phi Backend.", place)); } } diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 2b9a5f5e0ea0c..6c9291f816f7a 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -135,7 +135,6 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, template const T* DenseTensor::data() const { - check_memory_size(); PADDLE_ENFORCE_EQ( dtype(), paddle::experimental::CppTypeToDataType::Type(), @@ -147,13 +146,13 @@ const T* DenseTensor::data() const { template T* DenseTensor::data() { - check_memory_size(); + T* ret = static_cast(data()); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); - return static_cast(data()); + return ret; } void* DenseTensor::data() { diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 46c45837a5372..3c030cac2e7c9 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -371,12 +371,20 @@ dnnl::memory::format_tag DenseTensor::format() const { } #endif +// NOTE: For historical reasons, this interface has a special behavior, +// sharing other tensor members except lod DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { src.check_memory_size(); - // Preserve LoD - auto lod = meta_.lod; - *this = src; - meta_.lod = lod; + holder_ = src.holder_; + meta_.is_scalar = src.meta_.is_scalar; + meta_.dims = src.meta_.dims; + meta_.dtype = src.meta_.dtype; + meta_.layout = src.meta_.layout; + meta_.offset = src.meta_.offset; +#ifdef PADDLE_WITH_MKLDNN + format_ = src.format_; + mem_desc_ = src.mem_desc_; +#endif return *this; } diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index 8bdad9d6d2b6e..1d61f55f9dcd2 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -35,7 +35,7 @@ void InferMetaContext::EmplaceBackAttr(Attribute attr) { } void InferMetaContext::EmplaceBackInputs( - paddle::SmallVector inputs) { + paddle::small_vector inputs) { int index = inputs_.size(); 
input_range_.emplace_back(std::pair(index, index + inputs.size())); inputs_.insert(inputs_.end(), @@ -43,7 +43,7 @@ void InferMetaContext::EmplaceBackInputs( std::make_move_iterator(inputs.end())); } void InferMetaContext::EmplaceBackOutputs( - paddle::SmallVector outputs) { + paddle::small_vector outputs) { int index = outputs_.size(); output_range_.emplace_back( std::pair(index, index + outputs.size())); diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 8c726bffa2fc9..b974f2c868a8a 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -45,9 +45,9 @@ class InferMetaContext { void EmplaceBackAttr(Attribute attr); void EmplaceBackInputs( - paddle::SmallVector inputs); + paddle::small_vector inputs); void EmplaceBackOutputs( - paddle::SmallVector outputs); + paddle::small_vector outputs); virtual const MetaTensor& InputAt(size_t idx) const; virtual paddle::optional OptionalInputAt(size_t idx) const; @@ -72,16 +72,16 @@ class InferMetaContext { protected: MetaConfig config_; - paddle::SmallVector attrs_; + paddle::small_vector attrs_; - paddle::SmallVector, phi::kInputSmallVectorSize> + paddle::small_vector, phi::kInputSmallVectorSize> input_range_; - paddle::SmallVector, phi::kOutputSmallVectorSize> + paddle::small_vector, phi::kOutputSmallVectorSize> output_range_; private: - paddle::SmallVector inputs_; - paddle::SmallVector outputs_; + paddle::small_vector inputs_; + paddle::small_vector outputs_; }; #define PD_INFER_META(...) \ diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index 9935a5bf5cd9f..c902fc824f8d2 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -28,7 +28,7 @@ void KernelContext::EmplaceBackInputWithoutSetRange(const TensorBase* input) { } void KernelContext::EmplaceBackInputs( - paddle::SmallVector inputs) { + paddle::small_vector inputs) { int index = inputs_.size(); // Record the start and end index of the input input_range_.emplace_back(std::pair(index, index + inputs.size())); @@ -38,7 +38,7 @@ void KernelContext::EmplaceBackInputs( } void KernelContext::EmplaceBackInputsWithoutSetRange( - paddle::SmallVector inputs) { + paddle::small_vector inputs) { inputs_.insert(inputs_.end(), std::make_move_iterator(inputs.begin()), std::make_move_iterator(inputs.end())); @@ -56,7 +56,7 @@ void KernelContext::EmplaceBackOutputWithoutSetRange(TensorBase* output) { } void KernelContext::EmplaceBackOutputs( - paddle::SmallVector outputs) { + paddle::small_vector outputs) { int index = outputs_.size(); // Record the start and end index of the input output_range_.emplace_back( @@ -67,7 +67,7 @@ void KernelContext::EmplaceBackOutputs( } void KernelContext::EmplaceBackOutputsWithoutSetRange( - paddle::SmallVector outputs) { + paddle::small_vector outputs) { outputs_.insert(outputs_.end(), std::make_move_iterator(outputs.begin()), std::make_move_iterator(outputs.end())); diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index a06efb573a62f..8b43239d352b3 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -51,19 +51,19 @@ class KernelContext { void EmplaceBackInputWithoutSetRange(const TensorBase* input); - void EmplaceBackInputs(paddle::SmallVector inputs); + void EmplaceBackInputs(paddle::small_vector inputs); void EmplaceBackInputsWithoutSetRange( - paddle::SmallVector inputs); + paddle::small_vector inputs); void EmplaceBackOutput(TensorBase* output); void 
EmplaceBackOutputWithoutSetRange(TensorBase* output); - void EmplaceBackOutputs(paddle::SmallVector outputs); + void EmplaceBackOutputs(paddle::small_vector outputs); void EmplaceBackOutputsWithoutSetRange( - paddle::SmallVector outputs); + paddle::small_vector outputs); void EmplaceBackAttr(Attribute attr); @@ -138,12 +138,12 @@ class KernelContext { private: DeviceContext* dev_ctx_; - paddle::SmallVector inputs_; - paddle::SmallVector outputs_; - paddle::SmallVector attrs_; + paddle::small_vector inputs_; + paddle::small_vector outputs_; + paddle::small_vector attrs_; - paddle::SmallVector, kInputSmallVectorSize> input_range_; - paddle::SmallVector, kOutputSmallVectorSize> + paddle::small_vector, kInputSmallVectorSize> input_range_; + paddle::small_vector, kOutputSmallVectorSize> output_range_; }; diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 6d71c5016bda4..08329d0c8636a 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -140,6 +140,68 @@ const KernelArgsDef& KernelFactory::GetFirstKernelArgsDef( return iter->second.cbegin()->second.args_def(); } +std::ostream& operator<<(std::ostream& os, AttributeType attr_type) { + switch (attr_type) { + case AttributeType::BOOL: + os << "bool"; + break; + case AttributeType::INT32: + os << "int"; + break; + case AttributeType::INT64: + os << "int64_t"; + break; + case AttributeType::FLOAT32: + os << "float"; + break; + case AttributeType::FLOAT64: + os << "double"; + break; + case AttributeType::STRING: + os << "string"; + break; + case AttributeType::BOOLS: + os << "vector"; + break; + case AttributeType::INT32S: + os << "vector"; + break; + case AttributeType::INT64S: + os << "vector"; + break; + case AttributeType::FLOAT32S: + os << "vector"; + break; + case AttributeType::FLOAT64S: + os << "vector"; + break; + case AttributeType::STRINGS: + os << "vector"; + break; + case AttributeType::SCALAR: + os << "Scalar"; + break; + case AttributeType::SCALARS: + os << "vector"; + break; + case AttributeType::INT_ARRAY: + os << "IntArray"; + break; + case AttributeType::DATA_TYPE: + os << "DataType"; + break; + case AttributeType::DATA_LAYOUT: + os << "DataLayout"; + break; + case AttributeType::PLACE: + os << "Place"; + break; + default: + os << "Undefined"; + } + return os; +} + // print kernel info with json format: // { // "(CPU, Undefined(AnyLayout), complex64)": { @@ -175,7 +237,7 @@ std::ostream& operator<<(std::ostream& os, const Kernel& kernel) { need_comma = false; for (auto& arg_def : kernel.args_def().attribute_defs()) { if (need_comma) os << ","; - os << "\"" << arg_def.type_index.name() << "\""; + os << "\"" << arg_def.type_index << "\""; need_comma = true; } os << "]}"; diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 3ac99a426319d..c4c8274db976c 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -122,11 +122,33 @@ struct TensorArgDef { } }; +// Align the original fluid Attribute type with lower overhead +enum class AttributeType { + UNDEFINED = 0, + BOOL, + INT32, + INT64, + FLOAT32, + FLOAT64, + STRING, + BOOLS, + INT32S, + INT64S, + FLOAT32S, + FLOAT64S, + STRINGS, + SCALAR, + SCALARS, + INT_ARRAY, + DATA_TYPE, + DATA_LAYOUT, + PLACE, +}; + struct AttributeArgDef { - std::type_index type_index; + AttributeType type_index; - explicit AttributeArgDef(std::type_index type_index) - : type_index(type_index) {} + explicit AttributeArgDef(AttributeType type_index) : type_index(type_index) {} 
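// Sketch: AttributeArgDef now records the AttributeType enum rather than a raw
// std::type_index, and the operator<< added above gives each value a stable, readable
// spelling, so the JSON-style kernel dump no longer prints the ABI-mangled
// std::type_index::name(). Illustrative use (assumes the phi headers are available):
#include <sstream>
#include <string>

std::string DumpAttrTypes() {
  std::ostringstream os;
  os << phi::AttributeType::FLOAT32 << ", " << phi::AttributeType::SCALAR;
  return os.str();   // "float, Scalar"
}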
}; class KernelArgsDef { @@ -147,41 +169,42 @@ class KernelArgsDef { output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, type_index)); } - void AppendAttribute(std::type_index type_index) { + void AppendAttribute(AttributeType type_index) { attribute_defs_.emplace_back(AttributeArgDef(type_index)); } - const paddle::SmallVector& input_defs() + const paddle::small_vector& input_defs() const { return input_defs_; } - const paddle::SmallVector& output_defs() - const { + const paddle::small_vector& + output_defs() const { return output_defs_; } - const paddle::SmallVector& + const paddle::small_vector& attribute_defs() const { return attribute_defs_; } - paddle::SmallVector& input_defs() { + paddle::small_vector& input_defs() { return input_defs_; } - paddle::SmallVector& output_defs() { + paddle::small_vector& output_defs() { return output_defs_; } - paddle::SmallVector& attribute_defs() { + paddle::small_vector& + attribute_defs() { return attribute_defs_; } private: - paddle::SmallVector input_defs_{{}}; - paddle::SmallVector output_defs_{{}}; - paddle::SmallVector attribute_defs_{ + paddle::small_vector input_defs_{{}}; + paddle::small_vector output_defs_{{}}; + paddle::small_vector attribute_defs_{ {}}; }; @@ -277,6 +300,8 @@ inline std::ostream& operator<<(std::ostream& os, const KernelKey& kernel_key) { return os; } +std::ostream& operator<<(std::ostream& os, AttributeType attr_type); + std::ostream& operator<<(std::ostream& os, const Kernel& kernel); std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory); diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 356ab58f40726..36ab9c081cc37 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -163,11 +163,51 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(bool))) { + args_def->AppendAttribute(AttributeType::BOOL); + } else if (arg_type == std::type_index(typeid(int))) { + args_def->AppendAttribute(AttributeType::INT32); + } else if (arg_type == std::type_index(typeid(int64_t))) { + args_def->AppendAttribute(AttributeType::INT64); + } else if (arg_type == std::type_index(typeid(float))) { + args_def->AppendAttribute(AttributeType::FLOAT32); + } else if (arg_type == std::type_index(typeid(double))) { + args_def->AppendAttribute(AttributeType::FLOAT64); + } else if (arg_type == std::type_index(typeid(std::string))) { + args_def->AppendAttribute(AttributeType::STRING); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::BOOLS); + } else if (arg_type == std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::INT32S); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::INT64S); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::FLOAT32S); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::FLOAT64S); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::STRINGS); + } else if (arg_type == std::type_index(typeid(const Scalar&))) { + args_def->AppendAttribute(AttributeType::SCALAR); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + args_def->AppendAttribute(AttributeType::SCALARS); + 
} else if (arg_type == std::type_index(typeid(const IntArray&))) { + args_def->AppendAttribute(AttributeType::INT_ARRAY); + } else if (arg_type == std::type_index(typeid(DataType))) { + args_def->AppendAttribute(AttributeType::DATA_TYPE); + } else if (arg_type == std::type_index(typeid(DataLayout))) { + args_def->AppendAttribute(AttributeType::DATA_LAYOUT); + } else if (arg_type == std::type_index(typeid(Place))) { + args_def->AppendAttribute(AttributeType::PLACE); } else { - // Attribute deal with - // TODO(chenweihang): now here allow any types of attribute, maybe - // should add limits here - args_def->AppendAttribute(arg_type); + PADDLE_THROW(phi::errors::Unavailable( + "Unsupported kernel argument type `%s`.", arg_type.name())); } } } diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index ddc58f512bf14..f548d1da2d4e7 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -75,7 +75,7 @@ namespace phi { "Kernel's Input should appear before Attributes."); \ static_assert(out_idx == 0, \ "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ + const std::pair& range = ctx->InputRangeAt(in_idx); \ const tensor_type& arg = ctx->InputAt(range.first); \ KernelCallHelper:: \ template Compute( \ @@ -96,7 +96,7 @@ namespace phi { "Kernel's Input should appear before Attributes."); \ static_assert(out_idx == 0, \ "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ + const std::pair& range = ctx->InputRangeAt(in_idx); \ auto arg = ctx->OptionalInputAt(range.first); \ KernelCallHelper:: \ template Compute( \ @@ -117,7 +117,7 @@ namespace phi { "Kernel's Input should appear before Attributes."); \ static_assert(out_idx == 0, \ "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ + const std::pair& range = ctx->InputRangeAt(in_idx); \ std::vector arg = std::move( \ ctx->InputsBetween(range.first, range.second)); \ KernelCallHelper:: \ @@ -141,7 +141,7 @@ namespace phi { "Kernel's Input should appear before Attributes."); \ static_assert(out_idx == 0, \ "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ + const std::pair& range = ctx->InputRangeAt(in_idx); \ paddle::optional> arg = \ ctx->OptionalInputsBetween(range.first, range.second); \ KernelCallHelper:: \ @@ -195,7 +195,7 @@ namespace phi { int out_idx, \ typename... PreviousArgs> \ static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ - const std::pair range = ctx->OutputRangeAt(out_idx); \ + const std::pair& range = ctx->OutputRangeAt(out_idx); \ tensor_type* arg = ctx->MutableOutputAt(range.first); \ KernelCallHelper:: \ template Compute( \ @@ -212,7 +212,7 @@ namespace phi { int out_idx, \ typename... PreviousArgs> \ static void Compute(KernelContext* ctx, PreviousArgs&... 
pargs) { \ - const std::pair range = ctx->OutputRangeAt(out_idx); \ + const std::pair& range = ctx->OutputRangeAt(out_idx); \ std::vector arg = std::move( \ ctx->MutableOutputBetween(range.first, range.second)); \ KernelCallHelper:: \ diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index 7d4261ef82972..bf4d601c0b566 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -115,4 +115,12 @@ void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices, this->coalesced_ = coalesced; } +int32_t SparseCooTensor::sparse_dim() const { + return non_zero_indices_.dims()[0]; +} + +int32_t SparseCooTensor::dense_dim() const { + return dims_.size() - sparse_dim(); +} + } // namespace phi diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index ec43c5d62179b..c65b5ce57430b 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -150,6 +150,12 @@ class SparseCooTensor : public TensorBase, /// \brief set the dims of original dense tensor void set_dims(const DDim& dims) { this->dims_ = dims; } + /// \brief get the sparse dim + int32_t sparse_dim() const; + + /// \brief get the dnese dim + int32_t dense_dim() const; + private: // save the indices of non zero elements in original dense tensor DenseTensor non_zero_indices_; diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index ab9717a564eb5..447fab0e33c5b 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -27,9 +27,11 @@ SparseCsrTensor::SparseCsrTensor() { inline void check_shape(const DDim& dims) { bool valid = dims.size() == 2 || dims.size() == 3; - PADDLE_ENFORCE(valid, - phi::errors::InvalidArgument( - "the SparseCsrTensor only support 2-D Tensor.")); + PADDLE_ENFORCE( + valid, + phi::errors::InvalidArgument("the SparseCsrTensor only support 2-D or " + "3-D Tensor, but get %d-D Tensor", + dims.size())); } #define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \ { \ diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 4a4585e00eed6..602942abf4d34 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -443,6 +443,36 @@ void NllLossGradInferMeta(const MetaTensor& x, } } +void PixelUnshuffleGradInferMeta(const MetaTensor& out_grad, + int downscale_factor, + const std::string& data_format, + MetaTensor* x_grad) { + auto do_dims = out_grad.dims(); + PADDLE_ENFORCE_EQ(do_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + do_dims.size())); + + const bool channel_last = (data_format == "NHWC"); + + auto dx_dims = do_dims; + dx_dims[0] = do_dims[0]; + + if (!channel_last) { + dx_dims[1] = do_dims[1] / (downscale_factor * downscale_factor); + dx_dims[2] = do_dims[2] * downscale_factor; + dx_dims[3] = do_dims[3] * downscale_factor; + } else { + dx_dims[1] = do_dims[1] * downscale_factor; + dx_dims[2] = do_dims[2] * downscale_factor; + dx_dims[3] = do_dims[3] / (downscale_factor * downscale_factor); + } + x_grad->set_dims(dx_dims); + x_grad->set_dtype(out_grad.dtype()); +} + void PoolGradInferMeta(const MetaTensor& x, const MetaTensor& out, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 9db958778d597..c35b58d0f56e4 100644 --- a/paddle/phi/infermeta/backward.h +++ 
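// Worked example for the new SparseCooTensor::sparse_dim()/dense_dim() accessors:
// sparse_dim is the number of indexed dimensions, i.e. the first extent of the
// indices tensor, and dense_dim is whatever remains of the logical shape. For a COO
// tensor of shape [4, 5, 6] whose non_zero_indices_ has shape [2, nnz] (two indexed
// dimensions), the accessors return:
//   sparse_dim() == 2     // non_zero_indices_.dims()[0]
//   dense_dim()  == 1     // dims_.size() - sparse_dim() == 3 - 2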
b/paddle/phi/infermeta/backward.h @@ -178,6 +178,11 @@ void NllLossGradInferMeta(const MetaTensor& input, MetaTensor* intput_grad, MetaConfig config = MetaConfig()); +void PixelUnshuffleGradInferMeta(const MetaTensor& out_grad, + int downscale_factor, + const std::string& data_format, + MetaTensor* x_grad); + void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 5066d0cfd16fa..6d37a31f54562 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/strided_slice.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/impl/einsum_impl.h" namespace phi { @@ -398,6 +399,47 @@ void EighInferMeta(const MetaTensor& x, out_v->set_dims(input_dim); } +void EinsumInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out) { + // collect the following informations to prepare einsum. + LabelMap labelshape(0); + LabelMap labeltype(LabelType::Reduction); + std::vector label2perms(inputs.size(), LabelMap(-1)); + std::vector all_labels; + std::vector broadcast_dims; + std::vector output_dims; + std::vector> ellipsis_dims(2); + + std::vector input_dims; + for (auto& i : inputs) { + input_dims.push_back(i->dims()); + } + std::string right; + ParseEinsumEquation(equation, + input_dims, + &labelshape, + &labeltype, + &all_labels, + &label2perms, + &ellipsis_dims, + &broadcast_dims, + &output_dims, + &right); + + VLOG(3) << "Einsum Infershape: input dims:" + << paddle::string::join_strings(input_dims, "\n"); + VLOG(3) << "Einsum Infershape: equation:" << equation; + VLOG(3) << "Einsum Infershape: all_labels:" + << paddle::string::join_strings(all_labels, ","); + VLOG(3) << "Einsum Infershape: output dims:" + << paddle::string::join_strings(output_dims, ","); + VLOG(3) << "Label Type is : " << label_to_string(all_labels, labeltype); + VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); + out->set_dims(make_ddim(output_dims)); + out->set_dtype(inputs[0]->dtype()); +} + void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, MetaTensor* out) { @@ -1416,6 +1458,66 @@ void PixelShuffleGradInferMeta(const MetaTensor& out_grad, x_grad->set_dtype(out_grad.dtype()); } +void PixelUnshuffleInferMeta(const MetaTensor& x, + int downscale_factor, + const std::string& data_format, + MetaTensor* out) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); + PADDLE_ENFORCE_GE(downscale_factor, + 1, + phi::errors::InvalidArgument( + "downscale_factor should be larger than 0.")); + PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC", + true, + phi::errors::InvalidArgument( + "data_format must be one of " + "NCHW and NHWC. 
But recevied data_format: %s", + data_format)); + + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ( + (input_dims[2] % downscale_factor) == 0 && + (input_dims[3] % downscale_factor) == 0, + true, + phi::errors::InvalidArgument("Downscale factor[%u] should divide both " + "height[%u] and width[%u]", + downscale_factor, + input_dims[2], + input_dims[3])); + } else { + PADDLE_ENFORCE_EQ( + (input_dims[1] % downscale_factor) == 0 && + (input_dims[2] % downscale_factor) == 0, + true, + phi::errors::InvalidArgument("Downscale factor[%u] should divide both " + "height[%u] and width[%u]", + downscale_factor, + input_dims[1], + input_dims[2])); + } + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + if (!channel_last) { + output_dims[1] = input_dims[1] * (downscale_factor * downscale_factor); + output_dims[2] = input_dims[2] / downscale_factor; + output_dims[3] = input_dims[3] / downscale_factor; + } else { + output_dims[1] = input_dims[1] / downscale_factor; + output_dims[2] = input_dims[2] / downscale_factor; + output_dims[3] = input_dims[3] * (downscale_factor * downscale_factor); + } + out->set_dtype(x.dtype()); + out->set_dims(output_dims); +} + void PNormInferMeta(const MetaTensor& x, float porder, int axis, @@ -2260,8 +2362,7 @@ void SumRawInferMeta(const MetaTensor& x, if (dtype != DataType::UNDEFINED) { out_dtype = dtype; } else { - if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || - x.dtype() == DataType::INT64) { + if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32) { out_dtype = DataType::INT64; } else { out_dtype = x.dtype(); @@ -2952,7 +3053,7 @@ void UnStackInferMeta(const MetaTensor& x, } void OneHotRawInferMeta(const MetaTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, MetaTensor* out) { @@ -2962,7 +3063,7 @@ void OneHotRawInferMeta(const MetaTensor& x, 1, phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); + out_dims_vec.push_back(depth.to()); auto out_dims = phi::make_ddim(out_dims_vec); out->set_dims(out_dims); out->share_lod(x); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index c67eb2068d8bf..559857bd6ce9b 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -80,6 +80,10 @@ void EighInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v); +void EinsumInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out); + void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, MetaTensor* out); @@ -204,6 +208,11 @@ void PixelShuffleGradInferMeta(const MetaTensor& out_grad, const std::string& data_format, MetaTensor* x_grad); +void PixelUnshuffleInferMeta(const MetaTensor& x, + int downscale_factor, + const std::string& data_format, + MetaTensor* out); + void PNormInferMeta(const MetaTensor& x, float porder, int axis, @@ -426,7 +435,7 @@ void UnStackInferMeta(const MetaTensor& x, std::vector outs); void OneHotRawInferMeta(const MetaTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, MetaTensor* out); diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 065d018852267..fd42756ba3867 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -82,18 +82,18 @@ void ReluDoubleGradKernel(const Context& dev_ctx, 
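// Worked example for the PixelUnshuffleInferMeta shape rule above (NCHW case):
// pixel_unshuffle with downscale_factor r folds each r x r spatial block into the
// channel dimension, so [N, C, H, W] -> [N, C*r*r, H/r, W/r]. Stand-alone check:
#include <array>
#include <cassert>
#include <cstdint>

std::array<int64_t, 4> PixelUnshuffleShape(std::array<int64_t, 4> nchw, int64_t r) {
  assert(nchw[2] % r == 0 && nchw[3] % r == 0);  // mirrors the divisibility enforce
  return {nchw[0], nchw[1] * r * r, nchw[2] / r, nchw[3] / r};
}
// PixelUnshuffleShape({2, 3, 8, 8}, 2) returns {2, 12, 4, 4}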
template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, + const DenseTensor& ddx, DenseTensor* dout_new, DenseTensor* ddout); template void TanhTripleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, - const DenseTensor& d_ddout, + const DenseTensor& ddx, const DenseTensor& d_dout_new, + const DenseTensor& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx); diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 73752f015ca3a..2cb3b16a022b1 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -66,16 +66,16 @@ void BatchNormGradKernel(const Context& dev_ctx, template void BatchNormDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x_grad_grad, - const DenseTensor& scale_grad_grad, - const DenseTensor& bias_grad_grad, - const DenseTensor& y_grad, const DenseTensor& x, const DenseTensor& scale, - const DenseTensor& saved_mean, - const DenseTensor& saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& y_grad, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, float momentum, float epsilon, const std::string& data_layout, diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index ae87886b89bff..bf01c24f4ffa3 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -341,16 +341,16 @@ void BatchNormGradKernel(const Context& dev_ctx, template void BatchNormDoubleGradKernel(const Context& ctx, - const DenseTensor& x_grad_grad, - const DenseTensor& scale_grad_grad, - const DenseTensor& bias_grad_grad, - const DenseTensor& y_grad, const DenseTensor& x, const DenseTensor& scale, - const DenseTensor& saved_mean, - const DenseTensor& saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& y_grad, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, float momentum, float epsilon, const std::string& data_layout_str, diff --git a/paddle/phi/kernels/cpu/einsum_grad_kernel.cc b/paddle/phi/kernels/cpu/einsum_grad_kernel.cc new file mode 100644 index 0000000000000..2cfc2f92204fc --- /dev/null +++ b/paddle/phi/kernels/cpu/einsum_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/einsum_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/einsum_grad_impl.h" + +PD_REGISTER_KERNEL( + einsum_grad, CPU, ALL_LAYOUT, phi::EinsumGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc new file mode 100644 index 0000000000000..3e25a65526d89 --- /dev/null +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/einsum_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/einsum_impl.h" + +PD_REGISTER_KERNEL(einsum, CPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc index c785eacb9a8bc..b86ead04dbc5f 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -38,9 +38,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/cpu/one_hot_kernel.cc b/paddle/phi/kernels/cpu/one_hot_kernel.cc index 04f7c6a1f606d..fc7979e41d938 100644 --- a/paddle/phi/kernels/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -64,18 +64,19 @@ struct OneHotV2OpFunctor { template void OneHotRawKernel(const Context& dev_ctx, const DenseTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, DenseTensor* out) { + auto depth_v = depth.to(); auto out_dims = out->dims(); if (out_dims[out_dims.size() - 1] == -1) { - out_dims[out_dims.size() - 1] = depth; + out_dims[out_dims.size() - 1] = depth_v; out->Resize(out_dims); } phi::VisitDataType(dtype, - OneHotV2OpFunctor(&x, out, depth, dev_ctx)); + OneHotV2OpFunctor(&x, out, depth_v, dev_ctx)); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc new file mode 100644 index 0000000000000..ef61fca35957e --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h" +#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_unshuffle_grad, + CPU, + ALL_LAYOUT, + phi::PixelUnshuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc new file mode 100644 index 0000000000000..9f4bc747f3209 --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_unshuffle_kernel.h" +#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_unshuffle, + CPU, + ALL_LAYOUT, + phi::PixelUnshuffleKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/reduce_sum_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc index 32b12ea684528..0b4c4b9f4705a 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc @@ -29,6 +29,9 @@ void SumRawKernel(const Context& dev_ctx, bool reduce_all, DataType out_dtype, DenseTensor* out) { + if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) { + out_dtype = out->dtype(); + } phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } diff --git a/paddle/phi/kernels/einsum_grad_kernel.h b/paddle/phi/kernels/einsum_grad_kernel.h new file mode 100644 index 0000000000000..5c1970e775825 --- /dev/null +++ b/paddle/phi/kernels/einsum_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void EinsumGradKernel(const Context& dev_ctx, + const std::vector& x, + const DenseTensor& out_grad, + const std::string& equation, + std::vector x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h new file mode 100644 index 0000000000000..3d9e8feda748d --- /dev/null +++ b/paddle/phi/kernels/einsum_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void EinsumKernel(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 4cee24d2f8069..9d608cd86a6f7 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -103,7 +103,7 @@ PD_REGISTER_KERNEL(elementwise_pow, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(maximum, - GPU, + KPS, ALL_LAYOUT, phi::MaximumKernel, float, @@ -113,7 +113,7 @@ PD_REGISTER_KERNEL(maximum, phi::dtype::float16, phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(minimum, - GPU, + KPS, ALL_LAYOUT, phi::MinimumKernel, float, @@ -125,9 +125,9 @@ PD_REGISTER_KERNEL(minimum, PD_REGISTER_KERNEL( modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( - floor_divide, GPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} + floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, - GPU, + KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float, diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h index 7be91b4b9f4cd..97df769f4d046 100644 --- a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -30,9 +30,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 10216f80c00d4..aafa40a3d01bf 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -592,6 +592,7 @@ void BroadcastKernel(const KPDevice &ctx, int axis, Functor func) { std::vector dims_size; + dims_size.reserve(ins.size()); bool no_broadcast_flag = true; for (auto *in : ins) { no_broadcast_flag &= ins[0]->dims() == in->dims(); diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 
8d9dd65786705..4c2b6ef896e71 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" +#if defined(__xpu__) +#include +#include "xpu/kernel/math_xpu2.h" //pow() +#endif namespace phi { namespace funcs { @@ -573,6 +577,9 @@ struct ElementwisePowFunctor { return std::llrint( std::pow(static_cast(a), static_cast(b))); } +#endif +#ifdef PADDLE_WITH_XPU_KP + return pow(a, b); #endif return std::pow(a, b); } diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 1021b510b26cd..7508d8ee8cdc8 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/elementwise_utils.h" @@ -978,7 +979,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, // suppose perfoemance improves with h increased. dim3 block_size = dim3(BLOCK_X, BLOCK_Y); dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X); - auto gplace = phi::GPUPlace(); + auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()); auto *ctx = static_cast( paddle::platform::DeviceContextPool::Instance().Get(gplace)); paddle::platform::LimitGridDim(*ctx, &grid_size); @@ -1003,7 +1004,7 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); dim3 grid_size = dim3(n); - auto gplace = phi::GPUPlace(); + auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()); auto *ctx = static_cast( paddle::platform::DeviceContextPool::Instance().Get(gplace)); paddle::platform::LimitGridDim(*ctx, &grid_size); diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index e15b4cc10d97e..ad3b8579ddf67 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -908,16 +908,16 @@ void BatchNormGradKernel(const Context &dev_ctx, template void BatchNormDoubleGradKernel(const Context &ctx, - const DenseTensor &x_grad_grad, - const DenseTensor &scale_grad_grad, - const DenseTensor &bias_grad_grad, - const DenseTensor &y_grad, const DenseTensor &x, const DenseTensor &scale, - const DenseTensor &saved_mean, - const DenseTensor &saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &y_grad, + const DenseTensor &x_grad_grad, + const DenseTensor &scale_grad_grad, + const DenseTensor &bias_grad_grad, float momentum, float epsilon, const std::string &data_layout_str, @@ -988,10 +988,9 @@ PD_REGISTER_KERNEL(batch_norm_grad, double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + 
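Editor's note on the ElementwisePowFunctor hunk above: the integral instantiation routes through std::llrint(std::pow(...)) so that a result like 3^2 is not truncated to 8 when pow returns 8.999999..., and the new PADDLE_WITH_XPU_KP branch simply calls the device-side pow. A standalone sketch of the rounding rationale (illustrative helper, not the functor itself):

#include <cmath>
#include <cstdint>

// Round-to-nearest integer power: pow(3.0, 2.0) may come back as
// 8.999999..., and a plain integral cast would truncate it to 8.
int64_t IntegerPow(int64_t a, int64_t b) {
  return std::llrint(std::pow(static_cast<double>(a), static_cast<double>(b)));
}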
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } @@ -1003,10 +1002,9 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } @@ -1019,7 +1017,6 @@ PD_REGISTER_KERNEL(batch_norm_grad_grad, phi::BatchNormDoubleGradKernel, float, double) {} - #else PD_REGISTER_KERNEL(batch_norm_grad_grad, GPU, diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu index e04f2b5f87658..13975ddd3ef89 100644 --- a/paddle/phi/kernels/gpu/cumsum_kernel.cu +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -39,14 +39,12 @@ __device__ void BlockReverse( int tx = threadIdx.x; int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset - 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; + T src_data = 0; + int src_offset = BLOCK_SIZE - offset - 1; + if (src_offset < valid_item) { + src_data = idata[src_base + src_offset]; } + sh_mem[offset] = src_data; __syncthreads(); int out_index = dst_base - offset; diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu similarity index 68% rename from paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h rename to paddle/phi/kernels/gpu/einsum_grad_kernel.cu index 580fec10f2ac6..c8a8745f34522 100644 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -12,19 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#include "paddle/phi/kernels/einsum_kernel.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/einsum_grad_impl.h" -namespace paddle { -namespace framework { -namespace ir { - -class TransferCastOpPass : public Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle +PD_REGISTER_KERNEL( + einsum_grad, GPU, ALL_LAYOUT, phi::EinsumGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu new file mode 100644 index 0000000000000..d73e154eb40f7 --- /dev/null +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
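Editor's note on the BlockReverse rewrite in cumsum_kernel.cu above: the tail block now pads with zeros instead of reading stale shared memory, and this reversal feeds the reverse-cumsum path. As context only, a CPU-level sketch of what a reverse cumulative sum computes (standard library, not the CUDA kernel):

#include <algorithm>
#include <numeric>
#include <vector>

// reverse_cumsum(x)[i] = x[i] + x[i+1] + ... + x[n-1], computed as
// reverse -> inclusive scan -> reverse.
std::vector<float> ReverseCumsum(std::vector<float> x) {
  std::reverse(x.begin(), x.end());
  std::partial_sum(x.begin(), x.end(), x.begin());
  std::reverse(x.begin(), x.end());
  return x;
}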
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/einsum_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/einsum_impl.h" + +PD_REGISTER_KERNEL(einsum, GPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu index 20f3b73e4094f..017616df2782c 100644 --- a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -46,9 +46,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index ef6cd1323a9df..21a506a840cc7 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -133,11 +133,10 @@ void MultinomialKernel(const Context& dev_ctx, DenseTensor* out) { auto* in_data = x.data(); int64_t* out_data = dev_ctx.template Alloc(out); - auto in_dims = x.dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; + int64_t dim_size = in_dims.size(); + const int64_t num_categories = in_dims[dim_size - 1]; + const int64_t num_distributions = dim_size > 1 ? in_dims[dim_size - 2] : 1; // If replacement is False, it's not a replaceable sample. Every category // can be used only once. 
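Editor's note: the "can be used only once" comment above refers to the replacement == false path of MultinomialKernel. A CPU-only sketch of sampling without replacement, illustrative rather than the GPU implementation; the helper name and layout are assumptions.

#include <random>
#include <vector>

// Draw `n` distinct categories from one weight row: pick proportionally to
// the remaining weights, then zero the chosen weight so that category cannot
// be drawn again. Assumes n does not exceed the number of positive weights.
std::vector<int> SampleWithoutReplacement(std::vector<double> w, int n,
                                          std::mt19937* rng) {
  std::vector<int> picks;
  picks.reserve(n);
  for (int s = 0; s < n; ++s) {
    std::discrete_distribution<int> dist(w.begin(), w.end());
    const int c = dist(*rng);
    picks.push_back(c);
    w[c] = 0.0;  // every category can be used only once
  }
  return picks;
}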
@@ -145,8 +144,8 @@ void MultinomialKernel(const Context& dev_ctx, int64_t in_data_numel = x.numel(); int64_t out_data_numel = out->numel(); + // Just use to PADDLE_ENFORCE error message T* cpu_in_data = new T[in_data_numel]; - int64_t* cpu_out_data = new int64_t[out_data_numel]; #ifdef PADDLE_WITH_HIP hipMemcpy( @@ -160,7 +159,7 @@ void MultinomialKernel(const Context& dev_ctx, for (size_t i = 0; i < num_distributions; ++i) { int zero_num = 0; for (size_t j = 0; j < num_categories; ++j) { - T weight = cpu_in_data[i * num_distributions + j]; + T weight = cpu_in_data[i * num_categories + j]; PADDLE_ENFORCE_GE( weight, 0, diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu index c5884884231a8..2ae9e9333ecb5 100644 --- a/paddle/phi/kernels/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -73,18 +73,19 @@ struct OneHotV2OpCUDAFunctor { template void OneHotRawKernel(const Context& dev_ctx, const DenseTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, DenseTensor* out) { + auto depth_v = depth.to(); auto out_dims = out->dims(); if (out_dims[out_dims.size() - 1] == -1) { - out_dims[out_dims.size() - 1] = depth; + out_dims[out_dims.size() - 1] = depth_v; out->Resize(out_dims); } phi::VisitDataType( - dtype, OneHotV2OpCUDAFunctor(&x, out, depth, dev_ctx)); + dtype, OneHotV2OpCUDAFunctor(&x, out, depth_v, dev_ctx)); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu new file mode 100644 index 0000000000000..9cbbc5072aa25 --- /dev/null +++ b/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h" +#include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_unshuffle_grad, + GPU, + ALL_LAYOUT, + phi::PixelUnshuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu new file mode 100644 index 0000000000000..ca2e520ffde10 --- /dev/null +++ b/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
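Editor's note on the weight-indexing fix above: the probability matrix is laid out row-major as [num_distributions, num_categories], so entry (i, j) lives at i * num_categories + j; the old code scaled the row index by num_distributions instead. A small standalone check using the corrected indexing (illustrative helper, not the kernel's validation code):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Weights are stored row-major as [num_distributions, num_categories].
void CheckWeightsNonNegative(const std::vector<float>& w,
                             int64_t num_distributions,
                             int64_t num_categories) {
  for (int64_t i = 0; i < num_distributions; ++i) {
    for (int64_t j = 0; j < num_categories; ++j) {
      if (w[i * num_categories + j] < 0.0f) {
        throw std::invalid_argument("multinomial weights must be >= 0");
      }
    }
  }
}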
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h" +#include "paddle/phi/kernels/pixel_unshuffle_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_unshuffle, + GPU, + ALL_LAYOUT, + phi::PixelUnshuffleKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 4e488ed470df9..94f063512c06f 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -36,26 +36,29 @@ DECLARE_bool(use_curand); namespace phi { -template -__global__ void SwapRepeatKernel( - int* key, T* data, int n, uint64_t seed, uint64_t offset) { +template +__global__ void SwapRepeatKernel(keyT* key_out_data, + dataT* out_data, + int n, + uint64_t seed, + uint64_t offset) { size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx < n) return; + if (idx >= n - 1) return; // out of range - bool first_repeat = false; - if (data[idx] == data[idx + 1]) { + bool is_first_repeat = false; + if (key_out_data[idx] == key_out_data[idx + 1]) { if (idx == 0) { - first_repeat = true; - } else if (data[idx] != data[idx - 1]) { - first_repeat = true; + is_first_repeat = true; + } else if (key_out_data[idx] != key_out_data[idx - 1]) { + is_first_repeat = true; } } - if (!first_repeat) return; + if (!is_first_repeat) return; int repeat_size = 1; for (int i = idx; i < n; ++i) { - if (data[i] == data[i + 1]) { + if (key_out_data[i] == key_out_data[i + 1]) { ++repeat_size; } else { break; @@ -74,9 +77,9 @@ __global__ void SwapRepeatKernel( uint32_t r = hiprand(&state) % (i + 1); #endif if (r != i) { - T tmp = data[idx + i]; - data[idx + i] = data[idx + r]; - data[idx + r] = tmp; + dataT tmp = out_data[idx + i]; + out_data[idx + i] = out_data[idx + r]; + out_data[idx + r] = tmp; } } } @@ -138,10 +141,10 @@ void RandpermRawKernel( auto seed_offset = gen_cuda->IncrementOffset(n); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); - SwapRepeatKernel<<>>( + SwapRepeatKernel<<>>( key_out.data(), out_data, n, seed_offset.first, seed_offset.second); } diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index bf9b7cdf559d3..2f35acc095085 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -152,8 +152,8 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx, template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, + const DenseTensor& ddx, DenseTensor* dout_new, DenseTensor* ddout) { if (dout_new) { @@ -171,10 +171,10 @@ void TanhDoubleGradKernel(const Context& dev_ctx, template void TanhTripleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, - const DenseTensor& d_ddout, + const DenseTensor& ddx, const DenseTensor& d_dout_new, + const DenseTensor& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx) { diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h new file mode 100644 index 0000000000000..bd0143379ce15 --- /dev/null +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
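Editor's note on the SwapRepeatKernel rewrite above: randperm sorts the payload by random integer keys, and when two keys collide the order inside that run is not random, so each run of equal keys is re-shuffled; the rewrite also fixes the early-return condition (idx >= n - 1) and swaps the payload array rather than the keys. A CPU sketch of the per-run reshuffle, standard library only and not the CUDA kernel:

#include <random>
#include <utility>
#include <vector>

// After sorting data by random integer keys, runs of equal keys keep their
// pre-sort order; re-shuffle each run in place so the permutation stays
// uniform.
void ReshuffleEqualKeyRuns(const std::vector<int>& keys, std::vector<int>* data,
                           std::mt19937* rng) {
  size_t i = 0;
  while (i < keys.size()) {
    size_t j = i + 1;
    while (j < keys.size() && keys[j] == keys[i]) ++j;  // [i, j) is one run
    for (size_t k = j - 1; k > i; --k) {                // Fisher-Yates on the run
      std::uniform_int_distribution<size_t> pick(i, k);
      std::swap((*data)[k], (*data)[pick(*rng)]);
    }
    i = j;
  }
}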
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/impl/einsum_impl.h" +#include "paddle/phi/kernels/tile_kernel.h" +#include "paddle/utils/string/string_helper.h" + +namespace phi { +template +DenseTensor PerformTileAndReduction(const Context& dev_ctx, + const LabelMap& label2type, + const LabelMap& label2shape, + const std::vector& broadcast_dims, + const std::vector& ellipsis_dims, + std::string op_label, // value pass + DenseTensor& t) { // NOLINT + ReplaceEllipsis(op_label); + DenseTensor ret; + std::vector repeat_times; + std::vector resize_dims; + std::vector recover_shape; + for (int c : op_label) { + if (label2type[c] == LabelType::Reduction) { + // '.' can't be Reduction, so we don't deal '.' here. + repeat_times.push_back(label2shape[c]); + resize_dims.push_back(1); + recover_shape.push_back(label2shape[c]); + } else { + if (c != '.') { + resize_dims.push_back(label2shape[c]); + repeat_times.push_back(1); + recover_shape.push_back(label2shape[c]); + } else { + int n_dims = broadcast_dims.size(); + resize_dims.insert( + resize_dims.end(), broadcast_dims.begin(), broadcast_dims.end()); + recover_shape.insert( + recover_shape.end(), ellipsis_dims.begin(), ellipsis_dims.end()); + while (n_dims--) repeat_times.push_back(1); + } + } + } + t.Resize(make_ddim(resize_dims)); + DenseTensor after_tile; + TileKernel(dev_ctx, t, repeat_times, &after_tile); + size_t n_ellipsis_idx = op_label.find(".", 0); + if (n_ellipsis_idx != std::string::npos) { + // may be we need reduce. broadcast_dims is not equal to ellipsis dims. + std::vector to_reduce; + for (size_t i = 0; i < broadcast_dims.size() - ellipsis_dims.size(); ++i) + to_reduce.push_back(i + n_ellipsis_idx); + + int new_offset = + n_ellipsis_idx + broadcast_dims.size() - ellipsis_dims.size(); + for (size_t i = 0; i < ellipsis_dims.size(); ++i) + if (ellipsis_dims[i] == 1) to_reduce.push_back(i + new_offset); + + VLOG(5) << "PermformTileAndReduction: reduce sum axis: " + << paddle::string::join_strings(to_reduce, ","); + if (to_reduce.size() != 0) { + ret = Sum(dev_ctx, + after_tile, + to_reduce, + after_tile.dtype(), + false); // not keep dim. 
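Editor's note on PerformTileAndReduction above: labels that were reduced in the forward pass get their gradient by tiling the output gradient back along the reduced axis (the TileKernel call), and any leftover broadcast axes are then summed away (the Sum call). A scalar-level sketch of the tiling half for out[i] = sum_j x[i][j]; plain illustrative C++, not the phi code path.

#include <vector>

// For out[i] = sum_j x[i][j], the gradient of every x[i][j] is dout[i]:
// repeating dout along the reduced axis reconstructs dx.
std::vector<double> TileGradOverReducedAxis(const std::vector<double>& dout,
                                            int reduced_dim) {
  std::vector<double> dx;
  dx.reserve(dout.size() * reduced_dim);
  for (double g : dout) {
    for (int j = 0; j < reduced_dim; ++j) dx.push_back(g);
  }
  return dx;
}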
+ } else { + ret = after_tile; + } + } else { + ret = after_tile; + } + VLOG(5) << "PermformTileAndReduction: recover shape: " + << paddle::string::join_strings(recover_shape, ","); + ret.Resize(make_ddim(recover_shape)); + return ret; +} + +template +void EinsumGradKernel(const Context& dev_ctx, + const std::vector& x, + const DenseTensor& out_grad, + const std::string& equation, + std::vector x_grad) { + VLOG(5) << "Start EisumGradKernel:"; + LabelMap labelshape(0); + LabelMap labeltype(LabelType::Reduction); + std::vector label2perms(x.size(), LabelMap(-1)); + std::vector all_labels; // order: ABO, AO, BO, AB, Reduce + std::vector> ellipsis_dims(2); + std::vector broadcast_dims; + std::vector output_dims; + + std::vector input_dims; + for (auto& i : x) { + input_dims.push_back(i->dims()); + } + std::string right; + ParseEinsumEquation(equation, + input_dims, + &labelshape, + &labeltype, + &all_labels, + &label2perms, + &ellipsis_dims, + &broadcast_dims, + &output_dims, + &right); + + auto gather_labels_except_reduction = [&labeltype](std::string all) { + std::string res(""); + for (auto c : all) + if (labeltype[static_cast(c)] != LabelType::Reduction) res += c; + return res; + }; + if (x.size() == 1) { // Unary + auto splits = paddle::string::split_string(equation, "->"); + auto left = splits[0]; + right = splits[1].substr(1); + auto new_equation = right + "->" + gather_labels_except_reduction(left); + auto new_operands = std::vector(); + new_operands.push_back(&out_grad); + DenseTensor before_tile; + EinsumKernel(dev_ctx, new_operands, new_equation, &before_tile); + *(x_grad[0]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[0], + left, + before_tile); + } else { + auto splits = paddle::string::split_string(equation, "->"); + auto left = splits[0]; + auto ops = paddle::string::split_string(left, ","); + right = splits[1].substr(1); + + auto equation_for_A = + right + "," + ops[1] + "->" + gather_labels_except_reduction(ops[0]); + auto equation_for_B = + right + "," + ops[0] + "->" + gather_labels_except_reduction(ops[1]); + auto operands_for_A = std::vector(); + auto operands_for_B = std::vector(); + DenseTensor dA, dB; + operands_for_A.push_back(&out_grad); + operands_for_A.push_back(x[1]); + operands_for_B.push_back(&out_grad); + operands_for_B.push_back(x[0]); + + DenseTensor before_tile; + EinsumKernel(dev_ctx, operands_for_A, equation_for_A, &dA); + EinsumKernel(dev_ctx, operands_for_B, equation_for_B, &dB); + *(x_grad[0]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[0], + ops[0], + dA); + *(x_grad[1]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[1], + ops[1], + dB); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h new file mode 100644 index 0000000000000..73940a45cbde2 --- /dev/null +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -0,0 +1,548 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
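Editor's note on EinsumGradKernel above: for two operands the gradients are themselves einsums built from `right` and the other operand, e.g. for out = einsum("ij,jk->ik", A, B) one has dA = einsum("ik,jk->ij", dOut, B) and dB = einsum("ij,ik->jk", A, dOut); this is what equation_for_A and equation_for_B construct before PerformTileAndReduction restores any reduced labels. A concrete 2x2 check of the dA identity (dA = dOut * B^T), standard C++ only:

#include <array>

// dA for out = einsum("ij,jk->ik", A, B): dA = einsum("ik,jk->ij", dOut, B).
std::array<std::array<double, 2>, 2> GradOfFirstOperand(
    const std::array<std::array<double, 2>, 2>& d_out,
    const std::array<std::array<double, 2>, 2>& B) {
  std::array<std::array<double, 2>, 2> dA{};
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      for (int k = 0; k < 2; ++k) {
        dA[i][j] += d_out[i][k] * B[j][k];  // B accessed transposed
      }
    }
  }
  return dA;
}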
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" +#include "paddle/utils/string/string_helper.h" + +namespace phi { + +// check the validation of the Einsum equation. +// 1. the label must between 'a' - 'z'. +// 2. the dim of the same label must be same. +// 3. the broad cast dims in two operands is broadcastable. +// 4. there must exist '->' and the default output is complete in python. +// may be we can skip validation check in C++ and just put it in python. +inline static void ValidationCheck(const std::string& equation) { + auto n_part = paddle::string::split_string(equation, "->").size(); + PADDLE_ENFORCE_EQ(n_part, + 2, + phi::errors::InvalidArgument( + "Required at least one `->` in equation of EinsumOp.")); + size_t pos; + auto trimed_equ = equation; + if ((pos = trimed_equ.find("->", 0)) != std::string::npos) { + trimed_equ.replace(pos, 2, "."); + } + auto is_valid_char = [](char c) { + if (c >= 'a' && c <= 'z') return true; + if (c == '.' || c == ',') return true; + return false; + }; + for (auto c : trimed_equ) { + if (!is_valid_char(c)) + PADDLE_THROW(phi::errors::InvalidArgument( + "Found invalid char in equation. Einsum only accept `a`-`z` and `...`" + "but get:`%c`", + c)); + } +} + +enum LabelType { + ALL_TYPE = 0, + Batch = 1, // ABO + Free, // AO, BO + Contraction, // AB + Reduction, // A, B +}; + +// map a label('a' - 'z') -> int, O(1) speed. +class LabelMap { + constexpr static int N = + 26 + 1; // 'a' - 'z' + '.', '.' is for broadcast dims + int default_value; + int map[N]; + + public: + explicit LabelMap(int default_value = 0) { + this->default_value = default_value; + for (int i = 0; i < N; ++i) map[i] = default_value; + } + int& operator[](int label) { + int i = label - 'a'; + if (label == '.') i = N - 1; + return map[i]; + } + int operator[](int label) const { + int i = label - 'a'; + if (label == '.') i = N - 1; + return map[i]; + } + // non-exist is present by is_default + bool is_default(char label) { + return (*this)[static_cast(label)] == default_value; + } +}; + +inline std::string label_to_string(const std::vector& all_labels, + const LabelMap& label2type) { + std::string str; + for (int a : all_labels) { + std::stringstream ss; + ss << label2type[a]; + str += ss.str(); + } + return str; +} + +inline static void ReplaceEllipsis(std::string& s) { // NOLINT + size_t pos; + if ((pos = s.find("...", 0)) != std::string::npos) { + s.replace(pos, 3, "."); + } + // remove all the space in the expression + while ((pos = s.find(" ", 0)) != std::string::npos) { + s.replace(pos, 1, ""); + } +} + +inline std::vector union_labels(const std::vector& a, + const std::vector& b) { + LabelMap counter(0); + std::vector res; + auto f = [&](char c) { + if (counter[static_cast(c)] == 0) { + res.push_back(c); + } + counter[static_cast(c)] += 1; + }; + std::for_each(a.begin(), a.end(), f); + std::for_each(b.begin(), b.end(), f); + return res; +} + +inline static void GlobalInfo(const std::vector& op_labels, + const std::string& right, + LabelMap* label2type, + std::vector* sorted_labels) { + // sorted_labels: ['.', , ] + VLOG(5) << "GlobalInfo: " + << paddle::string::join_strings(*sorted_labels, ","); + std::vector all; + LabelMap counter(0); + for (auto& ch : right) { // char + int c = ch; + (*label2type)[c] = 
LabelType::Free; + } + + for (auto& op : op_labels) { + for (auto& ch : op) { // char + int c = ch; + if (counter.is_default(c)) { + all.push_back(ch); + } + counter[c] += 1; + if ((*label2type)[c] != LabelType::Free && counter[c] == 2) + (*label2type)[c] = LabelType::Contraction; + else if (counter[c] == 2) + (*label2type)[c] = LabelType::Batch; + } + } + (*label2type)['.'] = LabelType::Batch; + std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { + if ((*label2type)[c] == LabelType::Batch) + sorted_labels->push_back(static_cast(c)); + }); + std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { + if ((*label2type)[c] == LabelType::Free) + sorted_labels->push_back(static_cast(c)); + }); + std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { + if ((*label2type)[c] == LabelType::Contraction) + sorted_labels->push_back(static_cast(c)); + }); + std::for_each(all.begin(), all.end(), [&sorted_labels, label2type](int c) { + if ((*label2type)[c] == LabelType::Reduction) + sorted_labels->push_back(static_cast(c)); + }); + VLOG(5) << "GlobalInfo: sorted_labels before: " + << paddle::string::join_strings(*sorted_labels, ","); + if (counter[static_cast('.')] > 0) { + std::vector tmp; + tmp.push_back('.'); + // push '.' in the front + *sorted_labels = union_labels(tmp, *sorted_labels); + VLOG(5) << "GlobalInfo: sorted_labels after: " + << paddle::string::join_strings(*sorted_labels, ","); + } +} + +inline static void InferLabelShape(const std::vector& op_labels, + const std::vector& inputs, + LabelMap* labelshape, + std::vector>* ellipsis_dims, + std::vector* broadcast_dims) { + VLOG(5) << "Start InferLabelShape"; + int n_broadcast_dims = 0; + for (size_t i = 0; i < op_labels.size(); ++i) { + VLOG(5) << "oplabels: " << op_labels[i]; + int valid_indices = std::count_if(op_labels[i].begin(), + op_labels[i].end(), + [](char c) { return c != '.'; }); + int n_ellipsis = inputs[i].size() - valid_indices; + VLOG(5) << "valid indices and n_ellipsis: " << valid_indices << " " + << n_ellipsis; + ellipsis_dims->at(i).resize(n_ellipsis); + n_broadcast_dims = std::max(n_broadcast_dims, n_ellipsis); + } + VLOG(5) << "InferLabelShape: Broadcast ndims:" << n_broadcast_dims; + *broadcast_dims = std::vector(n_broadcast_dims, 1); + + for (size_t i = 0; i < op_labels.size(); ++i) { + auto& op_str = op_labels[i]; + auto& op_dim = inputs[i]; + int dim_ptr = 0; + for (int c : op_str) { + if (c == '.') { + for (auto& v : ellipsis_dims->at(i)) { + v = op_dim[dim_ptr]; + dim_ptr++; + } + } else if (labelshape->is_default(c) || (*labelshape)[c] == -1) { + (*labelshape)[c] = op_dim[dim_ptr]; + dim_ptr++; + } else { + PADDLE_ENFORCE_EQ( + (*labelshape)[c], + op_dim[dim_ptr], + phi::errors::InvalidArgument( + "Same label have different shapes for label: `%c`", c)); + dim_ptr++; + } + } + } + for (size_t i = 0; i < op_labels.size(); ++i) { + VLOG(5) << "InferLabelShape: Ellipsis ndims:" + << paddle::string::join_strings(ellipsis_dims->at(i), ","); + int idx = n_broadcast_dims - ellipsis_dims->at(i).size(); + for (auto v : ellipsis_dims->at(i)) { + PADDLE_ENFORCE_EQ( + v == 1 || broadcast_dims->at(idx) == 1 || + broadcast_dims->at(idx) == v, + true, + phi::errors::InvalidArgument( + "Ellipsis dims can't broadcasts. 
Please Check you operands.")); + broadcast_dims->at(idx) = std::max(v, broadcast_dims->at(idx)); + idx += 1; + } + } + VLOG(5) << "InferLabelShape: Broadcast dims:" + << paddle::string::join_strings(*broadcast_dims, ","); +} + +inline static void InferLabelPerm(const std::string& op, + int n_broadcast, + LabelMap* label2perm) { + int cur = 0; + for (int c : op) { + (*label2perm)[c] = cur; + if (c == '.') { + cur += n_broadcast; + } else { + cur += 1; + } + } +} + +inline static void InferOutputDims(const std::string& right, + const std::vector& broadcast_dims, + const LabelMap& labelshape, + std::vector* output_dims) { + for (int c : right) { + if (c == '.') { + output_dims->insert( + output_dims->end(), broadcast_dims.begin(), broadcast_dims.end()); + } else { + output_dims->push_back(labelshape[c]); + } + } +} +// +inline static void ParseEinsumEquation( + const std::string& equation, + const std::vector& inputs, + LabelMap* labelshape, + LabelMap* labeltype, + std::vector* all_labels, + std::vector* label2perms, + std::vector>* ellipsis_dims, + std::vector* broadcast_dims, + std::vector* output_dims, + std::string* right) { + auto results = paddle::string::split_string(equation, "->"); + auto left = results[0]; + ReplaceEllipsis(left); + *right = results[1].substr(1); + ReplaceEllipsis(*right); + auto op_labels = paddle::string::split_string(left, ","); + std::for_each(op_labels.begin(), op_labels.end(), ReplaceEllipsis); + GlobalInfo(op_labels, *right, labeltype, all_labels); + InferLabelShape(op_labels, inputs, labelshape, ellipsis_dims, broadcast_dims); + VLOG(5) << "Einsum Infershape: right:" << right; + VLOG(5) << "Einsum Infershape: op_labels:" + << paddle::string::join_strings(op_labels, "\n"); + InferOutputDims(*right, *broadcast_dims, *labelshape, output_dims); + for (size_t i = 0; i < inputs.size(); ++i) { + InferLabelPerm( + op_labels[i], ellipsis_dims->at(i).size(), &((*label2perms)[i])); + } +} + +template +std::vector GetLabelIndexByType(const std::vector& all_labels, + const LabelMap& type, + const LabelMap& perm, + const std::vector& ellipsis, + LabelType filter) { + std::vector res; + for (T c : all_labels) { + if ((filter == LabelType::ALL_TYPE || type[c] == filter) && perm[c] != -1) { + if (c == '.') { + for (size_t i = 0; i < ellipsis.size(); ++i) res.push_back(perm[c] + i); + } else { + res.push_back(perm[c]); + } + } + } + return res; +} + +template +std::vector GetShapeByType(const std::vector& all_labels, + const LabelMap& type, + const LabelMap& perm, + const LabelMap& label2shape, + const std::vector& ellipsis, + LabelType filter) { + std::vector res; + for (T c : all_labels) { + if ((filter == LabelType::ALL_TYPE || type[c] == filter) && perm[c] != -1) { + if (c == '.') + res.insert(res.end(), ellipsis.begin(), ellipsis.end()); + else + res.push_back(label2shape[c]); + } + } + return res; +} + +template +DenseTensor PerformReduction(const Context& dev_ctx, + const DenseTensor& tensor, + const LabelMap& label2perm, + const std::vector& all_labels, + const std::vector& ellipsis, + const LabelMap& label2type) { + auto indices = GetLabelIndexByType( + all_labels, label2type, label2perm, ellipsis, LabelType::Reduction); + VLOG(5) << "call PerformReduction: with axis: " + << paddle::string::join_strings(indices, ","); + if (indices.size() == 0) return tensor; + return Sum(dev_ctx, tensor, indices, tensor.dtype(), true); +} + +inline bool is_no_need_transpose(const std::vector& axis) { + for (size_t i = 0; i < axis.size(); ++i) { + if (i != static_cast(axis[i])) 
return false; + } + return true; +} + +template +DenseTensor PerformTranspose(const Context& dev_ctx, + const DenseTensor& tensor, + const LabelMap& label2perm, + const std::vector& all_labels, + const std::vector& ellipsis, + const LabelMap& label2type) { + auto axis = GetLabelIndexByType( + all_labels, label2type, label2perm, ellipsis, LabelType::ALL_TYPE); + VLOG(5) << "PerformTranspose: " << paddle::string::join_strings(axis, ","); + if (is_no_need_transpose(axis)) { + return tensor; + } + auto ret = Transpose(dev_ctx, tensor, axis); + VLOG(5) << "PerformTranspose: do_transpose()"; + return ret; +} + +template +DenseTensor PerformContraction( + const Context& dev_ctx, + const DenseTensor& A, + const DenseTensor& B, + const std::vector& label2perm, + const std::vector& all_labels, + const LabelMap& label2type, + const LabelMap& label2shape, + const std::vector>& ellipsis_dims, + const std::vector& broadcast_dims) { + // Get All the Batches, so perm is + auto all_valid = LabelMap(1); + auto recover_dim = GetShapeByType(all_labels, + label2type, + all_valid, + label2shape, + broadcast_dims, + LabelType::Batch); + auto preprocess = [&](const DenseTensor& t, + const LabelMap& perm, + const std::vector& ellipsis) -> DenseTensor { + auto frees = GetShapeByType( + all_labels, label2type, perm, label2shape, ellipsis, LabelType::Free); + auto conts = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + LabelType::Contraction); + auto trans_t = PerformTranspose( + dev_ctx, t, perm, all_labels, ellipsis, label2type); + auto mul_dims = GetShapeByType( + all_labels, label2type, perm, label2shape, ellipsis, LabelType::Batch); + recover_dim.insert(recover_dim.end(), frees.begin(), frees.end()); + mul_dims.push_back( + std::accumulate(frees.begin(), frees.end(), 1, std::multiplies())); + mul_dims.push_back( + std::accumulate(conts.begin(), conts.end(), 1, std::multiplies())); + VLOG(5) << "PerformContraction: mul_dims: " + << paddle::string::join_strings(mul_dims, ","); + trans_t.Resize(make_ddim(mul_dims)); + return trans_t; + }; + auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0]); + auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1]); + auto after_contraction = + Matmul(dev_ctx, trans_a, trans_b, false, true); + VLOG(5) << "PerformContraction: recover_dim: " + << paddle::string::join_strings(recover_dim, ","); + after_contraction.Resize(make_ddim(recover_dim)); + return after_contraction; +} + +template +void TransposeToOutput(const Context& dev_ctx, + const DenseTensor& to_trans, + const std::string& right, + const std::vector& all_labels, + int n_broadcast_dims, + DenseTensor* output) { + std::vector axis; + int offset = 0; + if (std::find(all_labels.begin(), all_labels.end(), '.') != + all_labels.end()) { + offset = n_broadcast_dims - 1; + } + for (char c : right) { + if (c == '.') { + for (int i = 0; i < n_broadcast_dims; ++i) axis.push_back(i); + } else { + auto it = std::find(all_labels.begin(), all_labels.end(), c); + PADDLE_ENFORCE_NE(it, + all_labels.end(), + phi::errors::InvalidArgument("Must in all_labels.")); + axis.push_back(it - all_labels.begin() + offset); + } + } + if (is_no_need_transpose(axis)) return output->ShareBufferWith(to_trans); + VLOG(5) << "call TransposeToOutput: with axis: " + << paddle::string::join_strings(axis, ","); + return TransposeKernel(dev_ctx, to_trans, axis, output); +} + +template +void EinsumKernel(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out) { + 
ValidationCheck(equation); + // collect the following informations to prepare einsum. + LabelMap labelshape(0); + LabelMap labeltype(LabelType::Reduction); + std::vector label2perms(inputs.size(), LabelMap(-1)); + std::vector all_labels; // order: ABO, AO, BO, AB, Reduce + std::vector> ellipsis_dims(2); + std::vector broadcast_dims; + std::vector output_dims; + + std::vector input_dims; + for (auto& i : inputs) { + input_dims.push_back(i->dims()); + } + std::string right; + ParseEinsumEquation(equation, + input_dims, + &labelshape, + &labeltype, + &all_labels, + &label2perms, + &ellipsis_dims, + &broadcast_dims, + &output_dims, + &right); + out->Resize(make_ddim(output_dims)); + if (inputs.size() == 2) { + auto& A = inputs[0]; + auto& B = inputs[1]; + // Reduce Procedure + auto reduce_A = PerformReduction( + dev_ctx, *A, label2perms[0], all_labels, ellipsis_dims[0], labeltype); + auto reduce_B = PerformReduction( + dev_ctx, *B, label2perms[1], all_labels, ellipsis_dims[1], labeltype); + // Contract Procedure + dev_ctx.template Alloc(out); + auto after_contraction = PerformContraction(dev_ctx, + reduce_A, + reduce_B, + label2perms, + all_labels, + labeltype, + labelshape, + ellipsis_dims, + broadcast_dims); + TransposeToOutput(dev_ctx, + after_contraction, + right, + all_labels, + broadcast_dims.size(), + out); + // Reshape Procedure + } else if (inputs.size() == 1) { + auto reduce_A = PerformReduction(dev_ctx, + *inputs[0], + label2perms[0], + all_labels, + ellipsis_dims[0], + labeltype); + std::vector right_labels; + for (auto c : right) right_labels.push_back(c); + right_labels = union_labels(right_labels, all_labels); + *out = PerformTranspose(dev_ctx, + reduce_A, + label2perms[0], + right_labels, + broadcast_dims, + labeltype); + out->Resize(make_ddim(output_dims)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "EinsumOp kernel only support len(operands) between (0, 2]. 
Use " + "opt_einsum first to convert multi-variable to binary-variable.")); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index aba4a5f5fbd43..fa1f15672b903 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -360,6 +360,14 @@ struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; +// avoid [-Wint-in-bool-context] warning +template <> +struct MulGradDX { + HOSTDEVICE bool operator()(bool x, bool y, bool out, bool dout) const { + return dout && y; + } +}; + template struct MulGradDX> { HOSTDEVICE phi::dtype::complex operator()( @@ -383,6 +391,14 @@ struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; +// avoid [-Wint-in-bool-context] warning +template <> +struct MulGradDY { + HOSTDEVICE bool operator()(bool x, bool y, bool out, bool dout) const { + return dout && x; + } +}; + template struct MulGradDY> { HOSTDEVICE phi::dtype::complex operator()( diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index b126ca9b84227..4f1e7af582c96 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -55,7 +55,9 @@ namespace phi { int axis, \ DenseTensor* out) { \ std::vector inputs; \ + inputs.reserve(2); \ std::vector outputs; \ + outputs.reserve(1); \ inputs.emplace_back(&x); \ inputs.emplace_back(&y); \ outputs.emplace_back(out); \ diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h new file mode 100644 index 0000000000000..cb02539f2e890 --- /dev/null +++ b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PixelUnshuffleGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + int downscale_factor, + const std::string& data_format, + DenseTensor* x_grad) { + auto* dout = &out_grad; + auto* dx = x_grad; + dev_ctx.template Alloc(dx); + int factor = downscale_factor; + bool channel_last = (data_format == "NHWC"); + auto do_dims = dout->dims(); + auto dx_dims = dx->dims(); + + DenseTensor t(*dout); + if (!channel_last) { + t.Resize({do_dims[0], dx_dims[1], factor, factor, do_dims[2], do_dims[3]}); + } else { + t.Resize({do_dims[0], do_dims[1], do_dims[2], dx_dims[3], factor, factor}); + } + std::vector axis = {0, 1, 4, 2, 5, 3}; + + DenseTensor o(*dx); + if (!channel_last) { + o.Resize({do_dims[0], dx_dims[1], do_dims[2], factor, do_dims[3], factor}); + } else { + o.Resize({do_dims[0], do_dims[1], factor, do_dims[2], factor, dx_dims[3]}); + } + phi::funcs::Transpose trans; + trans(dev_ctx, t, &o, axis); + dx->Resize(dx_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h new file mode 100644 index 0000000000000..0a140b270ba1b --- /dev/null +++ b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
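Editor's note on the two pixel_unshuffle kernels (gradient above, forward just below): both realize the mapping [N, C, H, W] -> [N, C*r*r, H/r, W/r] by viewing the tensor as six dimensions and transposing, rather than indexing element by element. A direct-indexing reference for the NCHW forward mapping, useful for checking the reshape+transpose route; this is a standalone sketch with a hypothetical helper name, not the phi implementation.

#include <vector>

// pixel_unshuffle for NCHW: every r x r spatial block becomes r*r extra
// channels; H and W must be divisible by r. The channel mapping
// oc = c*r*r + (h%r)*r + (w%r) matches the reshape to [N, C, H/r, r, W/r, r]
// followed by transpose(0, 1, 3, 5, 2, 4) used above.
std::vector<float> PixelUnshuffleNCHW(const std::vector<float>& x, int N,
                                      int C, int H, int W, int r) {
  const int Ho = H / r, Wo = W / r;
  std::vector<float> y(static_cast<size_t>(N) * C * r * r * Ho * Wo);
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          const int oc = c * r * r + (h % r) * r + (w % r);
          const int oh = h / r;
          const int ow = w / r;
          const size_t src =
              ((static_cast<size_t>(n) * C + c) * H + h) * W + w;
          const size_t dst =
              ((static_cast<size_t>(n) * C * r * r + oc) * Ho + oh) * Wo + ow;
          y[dst] = x[src];
        }
      }
    }
  }
  return y;
}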
+ +#pragma once + +#include +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PixelUnshuffleKernel(const Context& dev_ctx, + const DenseTensor& x, + int downscale_factor, + const std::string& data_format, + DenseTensor* out) { + auto* in = &x; + dev_ctx.template Alloc(out); + int factor = downscale_factor; + bool channel_last = (data_format == "NHWC"); + auto in_dims = in->dims(); + auto o_dims = out->dims(); + + DenseTensor t(*in); + if (!channel_last) { + t.Resize({in_dims[0], in_dims[1], o_dims[2], factor, o_dims[3], factor}); + } else { + t.Resize({in_dims[0], o_dims[1], factor, o_dims[2], factor, in_dims[3]}); + } + std::vector axis = {0, 1, 3, 5, 2, 4}; + + DenseTensor o(*out); + if (!channel_last) { + o.Resize({in_dims[0], in_dims[1], factor, factor, o_dims[2], o_dims[3]}); + } else { + o.Resize({in_dims[0], o_dims[1], o_dims[2], in_dims[3], factor, factor}); + } + phi::funcs::Transpose trans; + trans(dev_ctx, t, &o, axis); + out->Resize(o_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu index b5532c614314f..8f7d45771d9d0 100644 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -36,6 +36,7 @@ void AddKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {} PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu index 852babe29dbf7..827c478de9775 100644 --- a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu @@ -37,6 +37,7 @@ void DivideKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {} PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 5ccd3b1a48210..821fda52ab102 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -24,24 +24,65 @@ namespace phi { // Create the definition of Maximum DEFINE_CUDA_ELEMENTWISE_OP(Maximum) +template +void MaximumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MaximumRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Minimum DEFINE_CUDA_ELEMENTWISE_OP(Minimum) +template +void MinimumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MinimumRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Modulo DEFINE_CUDA_ELEMENTWISE_OP(Modulo) // Create the definition of FloorDivide DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide) +template +void FloorDivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + FloorDivideRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Pow DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) +template +void ElementwisePowKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + 
ElementwisePowRawKernel(dev_ctx, x, y, axis, out); +} } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { } +PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {} PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { } +PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) { +} PD_REGISTER_KERNEL( floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {} +PD_REGISTER_KERNEL( + elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {} +PD_REGISTER_KERNEL( + elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) { +} #else using float16 = phi::dtype::float16; diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu index 8bede0198c2fa..99408ff214268 100644 --- a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu @@ -37,6 +37,7 @@ void MultiplyKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {} PD_REGISTER_KERNEL( multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu index 757dedb99c931..b99f687b59f4e 100644 --- a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu @@ -37,6 +37,7 @@ void SubtractKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {} PD_REGISTER_KERNEL( subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu index b732d371ad1ef..815675953953d 100644 --- a/paddle/phi/kernels/kps/logical_kernel.cu +++ b/paddle/phi/kernels/kps/logical_kernel.cu @@ -65,9 +65,9 @@ void LogicalNotKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(logical_and, KPS, ALL_LAYOUT, phi::LogicalAndKernel, int) {} -PD_REGISTER_KERNEL(logical_Or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} -PD_REGISTER_KERNEL(logical_Not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} -PD_REGISTER_KERNEL(logical_Xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} +PD_REGISTER_KERNEL(logical_or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} +PD_REGISTER_KERNEL(logical_not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} +PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} #else #define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ PD_REGISTER_KERNEL(logical_and, \ diff --git a/paddle/phi/kernels/kps/reduce_sum_kernel.cu b/paddle/phi/kernels/kps/reduce_sum_kernel.cu index 6c039897ddd30..e800e4685ec04 100644 --- a/paddle/phi/kernels/kps/reduce_sum_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_sum_kernel.cu @@ -27,6 +27,9 @@ void SumRawKernel(const Context& dev_ctx, bool reduce_all, DataType out_dtype, DenseTensor* out) { + if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) { + out_dtype = out->dtype(); + } phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } diff --git a/paddle/phi/kernels/one_hot_kernel.cc 
b/paddle/phi/kernels/one_hot_kernel.cc index 633f48cbb62ac..755e06752509a 100644 --- a/paddle/phi/kernels/one_hot_kernel.cc +++ b/paddle/phi/kernels/one_hot_kernel.cc @@ -24,9 +24,8 @@ void OneHotKernel(const Context& dev_ctx, const DenseTensor& x, const Scalar& num_classes_s, DenseTensor* out) { - int num_classes = num_classes_s.to(); OneHotRawKernel( - dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out); + dev_ctx, x, num_classes_s, phi::DataType::FLOAT32, false, out); } } // namespace phi diff --git a/paddle/phi/kernels/one_hot_kernel.h b/paddle/phi/kernels/one_hot_kernel.h index 9f89609ea6336..79af88473b278 100644 --- a/paddle/phi/kernels/one_hot_kernel.h +++ b/paddle/phi/kernels/one_hot_kernel.h @@ -28,7 +28,7 @@ void OneHotKernel(const Context& dev_ctx, template void OneHotRawKernel(const Context& dev_ctx, const DenseTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, DenseTensor* out); diff --git a/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h b/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h new file mode 100644 index 0000000000000..868633e56be50 --- /dev/null +++ b/paddle/phi/kernels/pixel_unshuffle_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PixelUnshuffleGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + int downscale_factor, + const std::string& data_format, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/pixel_unshuffle_kernel.h b/paddle/phi/kernels/pixel_unshuffle_kernel.h new file mode 100644 index 0000000000000..179e2b6639f9e --- /dev/null +++ b/paddle/phi/kernels/pixel_unshuffle_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
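Editor's note on the one_hot changes above: OneHotKernel now forwards `num_classes_s` as a Scalar instead of converting it to an int at the wrapper level, so the concrete value is only read inside the kernel that needs it, which is the usual motivation for letting such attributes also originate from a tensor at runtime. A toy stand-in for the "materialize late" idea; LazyInt is purely illustrative and is not phi::Scalar.

#include <cstdint>
#include <functional>
#include <utility>

// The wrapped value may be a literal or something computed at call time
// (e.g. read from a tensor), and is only converted to a concrete integer
// where it is actually needed.
class LazyInt {
 public:
  explicit LazyInt(int64_t v) : getter_([v] { return v; }) {}
  explicit LazyInt(std::function<int64_t()> getter)
      : getter_(std::move(getter)) {}
  int64_t ToInt64() const { return getter_(); }

 private:
  std::function<int64_t()> getter_;
};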
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PixelUnshuffleKernel(const Context& dev_ctx, + const DenseTensor& x, + int downscale_factor, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h old mode 100755 new mode 100644 index b01e0474f2d02..fdcbb5ec9cc8d --- a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h @@ -124,7 +124,8 @@ struct MaxFunctor { */ template struct AddFunctor { - inline T initial() { return static_cast(0.0f); } + inline T initial() { /*return static_cast(0.0f);*/ + } __device__ T operator()(const T a, const T b) const { return b + a; } }; @@ -134,7 +135,8 @@ struct AddFunctor { */ template struct MulFunctor { - inline T initial() { return static_cast(1.0f); } + inline T initial() { /*return static_cast(1.0f);*/ + } __device__ T operator()(const T& a, const T& b) const { return b * a; } }; @@ -144,7 +146,8 @@ struct MulFunctor { */ template struct LogicalOrFunctor { - inline T initial() { return static_cast(false); } + inline T initial() { /*return static_cast(false);*/ + } __device__ T operator()(const T& a, const T& b) const { return b || a; } }; diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 0ec8b808ba838..0e5714b174361 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -39,7 +39,7 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, phi::errors::InvalidArgument("the input x and mask must have the shape")); const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); - int sparse_dim = indices.dims().size(); + const int sparse_dim = mask.sparse_dim(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); DenseTensor out_values = phi::EmptyLike(dev_ctx, values); @@ -95,7 +95,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, 2, phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + const int32_t sparse_dim = x.sparse_dim(); std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc index 78b6354f44f9e..71a0095395552 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -50,7 +50,7 @@ void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); T* x_grad_ptr = x_grad_values.data(); - memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); + memset(x_grad_ptr, 0, sizeof(T) * x_grad_values.numel()); phi::Copy(dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 0499371a4dd17..69ac0417f763d 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -254,11 +254,13 @@ void SparseCooToDenseKernel(const Context& dev_ctx, if (indices_dims.size() == 
1) { sparse_dim = 1; } - const int64_t dense_dim = values.dims().size() - 1; + const int64_t dense_dim = x.dense_dim(); - const auto place = dev_ctx.GetPlace(); const T* x_data = values.data(); - T* out_data = out->mutable_data(place); + *out = phi::Empty( + dev_ctx, + DenseTensorMeta(x.dtype(), x.dims(), x.non_zero_elements().layout())); + T* out_data = out->data(); int64_t base_offset = 1; for (int64_t i = 0; i < dense_dim; i++) { base_offset *= dense_dims[sparse_dim + i]; diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 4253845956ea7..81c63c48ebff2 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -42,7 +42,7 @@ __global__ void MaskKernel(const T* x_ptr, int64_t col_i = i - out_i * cols; int64_t index = 0; for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j]; + index += indices_ptr[j * non_zero_num + out_i] * sparse_offsets[j]; } out_values_ptr[out_i * cols + col_i] = x_ptr[index * cols + col_i]; } @@ -60,16 +60,13 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, phi::errors::InvalidArgument("the input x and mask must have the shape")); const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); - int sparse_dim = indices.dims().size(); + const int sparse_dim = mask.sparse_dim(); DenseTensor sparse_offsets = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); std::vector h_sparse_offsets(sparse_dim); - int64_t offset = 1; - for (int i = sparse_dim - 1; i >= 0; i--) { - h_sparse_offsets[i] = offset; - offset *= dims[i]; - } + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, h_sparse_offsets.data()); phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), &h_sparse_offsets[0], @@ -151,7 +148,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, 2, phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + const int32_t sparse_dim = x.sparse_dim(); auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); std::vector sparse_offsets(sparse_dim); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu index bd862a44afeeb..c22e67eef6712 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -64,7 +64,7 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, int rulebook_len = rulebook.dims()[1]; const IntT* rulebook_ptr = rulebook.data(); std::vector offsets(kernel_size + 1), counter(kernel_size, 0), - h_counter(kernel_size); + h_counter(rulebook_len, 0); phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], rulebook_ptr, rulebook_len * sizeof(IntT), diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu index b76b61f83bfc9..e3eb7aa24331d 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -104,7 +104,7 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, #endif out_features_ptr, out_features_ptr + out->non_zero_elements().numel(), - static_cast(-FLT_MAX)); + static_cast(0)); // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster for (int i = 0; i < kernel_size; i++) { if (counter[i] <= 0) { diff 
--git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 0b6ac1aed0147..960d7eab26463 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -503,7 +503,10 @@ void SparseCooToDenseKernel(const Context& dev_ctx, const auto place = dev_ctx.GetPlace(); const T* x_data = values.data(); - T* out_data = out->mutable_data(place); + *out = phi::Empty(dev_ctx, + phi::DenseTensorMeta( + x.dtype(), x.dims(), x.non_zero_elements().layout())); + T* out_data = out->data(); int64_t base_offset = 1; for (int64_t i = 0; i < dense_dim; i++) { base_offset *= dense_dims[sparse_dim + i]; diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 072e6f141f8f1..d39790fcea5e3 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -110,7 +110,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, template DenseTensor SparseCooToDense(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensorMeta meta(x.dtype(), x.dims(), x.layout()); + DenseTensorMeta meta(x.dtype(), x.dims(), x.non_zero_elements().layout()); DenseTensor dense = phi::Empty(dev_ctx, std::move(meta)); SparseCooToDenseKernel(dev_ctx, x, &dense); return dense; @@ -129,7 +129,7 @@ void SparseCsrToDenseKernel(const Context& dev_ctx, template DenseTensor SparseCsrToDense(const Context& dev_ctx, const SparseCsrTensor& x) { - DenseTensorMeta meta(x.dtype(), x.dims(), x.layout()); + DenseTensorMeta meta(x.dtype(), x.dims(), x.non_zero_elements().layout()); DenseTensor dense = phi::Empty(dev_ctx, std::move(meta)); SparseCsrToDenseKernel(dev_ctx, x, &dense); return dense; diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 5900b49946623..157eaa279debb 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -121,13 +121,13 @@ KernelSignature ReluDoubleGradOpArgumentMapping( KernelSignature TanhDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); + "tanh_double_grad", {"Out", "DOut", "DDX"}, {}, {"DOutNew", "DDOut"}); } KernelSignature TanhTripleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("tanh_triple_grad", - {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {"Out", "DOut", "DDX", "D_DOut_New", "D_DDOut"}, {}, {"D_OutNew", "D_DOut", "D_DDx"}); } diff --git a/paddle/phi/ops/compat/adam_sig.cc b/paddle/phi/ops/compat/adam_sig.cc index 958538cd7dfc2..f3e7eeb6b6762 100644 --- a/paddle/phi/ops/compat/adam_sig.cc +++ b/paddle/phi/ops/compat/adam_sig.cc @@ -19,22 +19,22 @@ namespace phi { KernelSignature AdamOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::SmallVector in_names = {"Param", - "Grad", - "LearningRate", - "Moment1", - "Moment2", - "Beta1Pow", - "Beta2Pow", - "MasterParam", - "SkipUpdate"}; - paddle::SmallVector out_names = {"ParamOut", - "Moment1Out", - "Moment2Out", - "Beta1PowOut", - "Beta2PowOut", - "MasterParamOut"}; - paddle::SmallVector attr_names; + paddle::small_vector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam", + "SkipUpdate"}; + paddle::small_vector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + 
"MasterParamOut"}; + paddle::small_vector attr_names; attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" : "beta1"); diff --git a/paddle/phi/ops/compat/adamw_sig.cc b/paddle/phi/ops/compat/adamw_sig.cc index e417aa30ba493..b4cf6f3cbbe6d 100644 --- a/paddle/phi/ops/compat/adamw_sig.cc +++ b/paddle/phi/ops/compat/adamw_sig.cc @@ -19,22 +19,22 @@ namespace phi { KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::SmallVector in_names = {"Param", - "Grad", - "LearningRate", - "Moment1", - "Moment2", - "Beta1Pow", - "Beta2Pow", - "MasterParam", - "SkipUpdate"}; - paddle::SmallVector out_names = {"ParamOut", - "Moment1Out", - "Moment2Out", - "Beta1PowOut", - "Beta2PowOut", - "MasterParamOut"}; - paddle::SmallVector attr_names; + paddle::small_vector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam", + "SkipUpdate"}; + paddle::small_vector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + "MasterParamOut"}; + paddle::small_vector attr_names; attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" : "beta1"); diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index 14affe60b9d55..1c6b63d70c705 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -82,16 +82,16 @@ KernelSignature BatchNormGradOpArgumentMapping( KernelSignature BatchNormGradGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("batch_norm_grad_grad", - {"DDX", - "DDScale", - "DDBias", - "DY", - "X", + {"X", "Scale", + "Mean", + "Variance", "SavedMean", "SavedVariance", - "Mean", - "Variance"}, + "DY", + "DDX", + "DDScale", + "DDBias"}, {"momentum", "epsilon", "data_layout", diff --git a/paddle/phi/ops/compat/clip_sig.cc b/paddle/phi/ops/compat/clip_sig.cc index 25a34f2b9c89f..889dbf6ef9f79 100644 --- a/paddle/phi/ops/compat/clip_sig.cc +++ b/paddle/phi/ops/compat/clip_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature ClipOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::SmallVector attr_names; + paddle::small_vector attr_names; attr_names.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); attr_names.emplace_back(ctx.HasInput("Max") ? "Max" : "max"); if (ctx.IsDenseTensorInput("X")) { diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc new file mode 100644 index 0000000000000..0b3cc3425df45 --- /dev/null +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); +} + +KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("einsum_grad", + {"Operands", {"Out@GRAD"}}, + {"equation"}, + {{"Operands@GRAD"}}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(einsum, phi::EinsumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(einsum_grad, phi::EinsumGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 19110eb0e0ab8..13a5a6fd4a449 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -133,7 +133,7 @@ KernelSignature ElementwiseSubGradOpArgumentMapping( KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); + "subtract_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"}); } KernelSignature ElementwiseDivGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc b/paddle/phi/ops/compat/pixel_unshuffle_sig.cc new file mode 100644 index 0000000000000..817dc1a228877 --- /dev/null +++ b/paddle/phi/ops/compat/pixel_unshuffle_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PixelUnshuffleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pixel_unshuffle_grad", + {"Out@GRAD"}, + {"downscale_factor", "data_format"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle_grad, + phi::PixelUnshuffleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/strided_slice_sig.cc b/paddle/phi/ops/compat/strided_slice_sig.cc index 5421fcd616ce7..02b3914787866 100644 --- a/paddle/phi/ops/compat/strided_slice_sig.cc +++ b/paddle/phi/ops/compat/strided_slice_sig.cc @@ -48,14 +48,14 @@ KernelSignature StridedSliceOpArgumentMapping( ? (use_attr_strides ? "strides" : "StridesTensorList") : "strides"); - paddle::SmallVector inputs = {"Input"}; - paddle::SmallVector attrs = {"axes", - starts_key, - ends_key, - strides_key, - "infer_flags", - "decrease_axis"}; - paddle::SmallVector outputs = {"Out"}; + paddle::small_vector inputs = {"Input"}; + paddle::small_vector attrs = {"axes", + starts_key, + ends_key, + strides_key, + "infer_flags", + "decrease_axis"}; + paddle::small_vector outputs = {"Out"}; const char* kernel_name; if (ctx.IsDenseTensorVectorInput("Input")) { @@ -97,14 +97,14 @@ KernelSignature StridedSliceGradOpArgumentMapping( ? (use_attr_strides ? 
"strides" : "StridesTensorList") : "strides"); - paddle::SmallVector inputs = {"Input", "Out@GRAD"}; - paddle::SmallVector attrs = {"axes", - starts_key, - ends_key, - strides_key, - "infer_flags", - "decrease_axis"}; - paddle::SmallVector outputs = {"Input@GRAD"}; + paddle::small_vector inputs = {"Input", "Out@GRAD"}; + paddle::small_vector attrs = {"axes", + starts_key, + ends_key, + strides_key, + "infer_flags", + "decrease_axis"}; + paddle::small_vector outputs = {"Input@GRAD"}; const char* kernel_name; if (ctx.IsDenseTensorVectorInput("Input")) { diff --git a/paddle/phi/ops/compat/sum_sig.cc b/paddle/phi/ops/compat/sum_sig.cc index 4364047b0e61b..d71111408f854 100644 --- a/paddle/phi/ops/compat/sum_sig.cc +++ b/paddle/phi/ops/compat/sum_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature SumOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInput("X")) { + if (ctx.IsDenseTensorInputs("X")) { return KernelSignature("add_n", {"X"}, {}, {"Out"}); } return KernelSignature("unregistered", {}, {}, {}); diff --git a/paddle/phi/tests/core/test_kernel_factory.cc b/paddle/phi/tests/core/test_kernel_factory.cc index cb4b50f5b6c3d..490d4967eeba2 100644 --- a/paddle/phi/tests/core/test_kernel_factory.cc +++ b/paddle/phi/tests/core/test_kernel_factory.cc @@ -73,6 +73,67 @@ TEST(KernelRegistry, SetFP32Input) { EXPECT_EQ(output_defs.at(0).dtype, phi::DataType::FLOAT16); } +TEST(AttributeType, OStream) { + std::ostringstream oss; + oss << phi::AttributeType::UNDEFINED; + EXPECT_EQ(oss.str(), "Undefined"); + oss.str(""); + oss << phi::AttributeType::BOOL; + EXPECT_EQ(oss.str(), "bool"); + oss.str(""); + oss << phi::AttributeType::INT32; + EXPECT_EQ(oss.str(), "int"); + oss.str(""); + oss << phi::AttributeType::INT64; + EXPECT_EQ(oss.str(), "int64_t"); + oss.str(""); + oss << phi::AttributeType::FLOAT32; + EXPECT_EQ(oss.str(), "float"); + oss.str(""); + oss << phi::AttributeType::FLOAT64; + EXPECT_EQ(oss.str(), "double"); + oss.str(""); + oss << phi::AttributeType::STRING; + EXPECT_EQ(oss.str(), "string"); + oss.str(""); + oss << phi::AttributeType::BOOLS; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::INT32S; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::INT64S; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::FLOAT32S; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::FLOAT64S; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::STRINGS; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::SCALAR; + EXPECT_EQ(oss.str(), "Scalar"); + oss.str(""); + oss << phi::AttributeType::SCALARS; + EXPECT_EQ(oss.str(), "vector"); + oss.str(""); + oss << phi::AttributeType::INT_ARRAY; + EXPECT_EQ(oss.str(), "IntArray"); + oss.str(""); + oss << phi::AttributeType::DATA_TYPE; + EXPECT_EQ(oss.str(), "DataType"); + oss.str(""); + oss << phi::AttributeType::DATA_LAYOUT; + EXPECT_EQ(oss.str(), "DataLayout"); + oss.str(""); + oss << phi::AttributeType::PLACE; + EXPECT_EQ(oss.str(), "Place"); + oss.str(""); +} + } // namespace tests } // namespace phi diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index 07832494d50ec..afdd3bc0d9ad0 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ -68,7 +68,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) { phi::DenseTensor dense_out1; phi::DenseTensor dense_out2; - 
paddle::SmallVector out; + paddle::small_vector out; out.emplace_back(phi::MetaTensor(&dense_out1)); out.emplace_back(phi::MetaTensor(&dense_out2)); diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 4a84793527ea7..1535f40b70072 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -68,6 +68,10 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return dense_tensor_inputs.count(name) > 0; } + bool IsDenseTensorInputs(const std::string& name) const override { + return dense_tensor_inputs.count(name) > 0; + } + bool IsSelectedRowsInput(const std::string& name) const override { return selected_rows_inputs.count(name) > 0; } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2e2efa65d7007..0e1d0660322bd 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -132,6 +132,18 @@ function cmake_base() { else exit 1 fi + elif [ "$1" == "cp310-cp310" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.10" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.10/include/python3.10/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/libpython3.10.dylib" + pip3.10 install --user -r ${PADDLE_ROOT}/python/requirements.txt + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -164,6 +176,13 @@ function cmake_base() { -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.9.0/include/python3.9 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.9.0/lib/libpython3.so" pip3.9 install -r ${PADDLE_ROOT}/python/requirements.txt + elif [ "$1" == "cp310-cp310" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.10.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.10.0/bin/python3.10 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.10.0/include/python3.10 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.10.0/lib/libpython3.so" + pip3.10 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "conda-python3.7" ]; then export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH} export PATH=/opt/conda/bin/:${PATH} @@ -612,6 +631,8 @@ EOF pip3.8 uninstall -y paddlepaddle elif [ "$1" == "cp39-cp39" ]; then pip3.9 uninstall -y paddlepaddle + elif [ "$1" == "cp310-cp310" ]; then + pip3.10 uninstall -y paddlepaddle fi set -ex @@ -627,6 +648,9 @@ EOF elif [ "$1" == "cp39-cp39" ]; then pip3.9 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.9 install --user hypothesis + elif [ "$1" == "cp310-cp310" ]; then + pip3.10 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + pip3.10 install --user hypothesis fi tmpfile_rand=`date +%s%N` tmpfile=$tmp_dir/$tmpfile_rand @@ -728,6 +752,8 @@ function run_linux_cpu_test() { pip install hypothesis pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + cp 
${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python + cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python ut_total_startTime_s=`date +%s` if [ ${WITH_TESTING:-ON} == "ON" ] ; then cat < $tmpfile1 2>&1 & + gpu_memory_pid=$! env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I 0,,1 -R "($testcases)" --timeout 500 --output-on-failure -V -j 1 > $tmpfile + kill ${gpu_memory_pid} + cat $tmpfile1 | tr -d ' MiB' | awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY_USE=", max}' >> $tmpfile + cat $tmpfile1 | tr -d ' MiB' | awk 'BEGIN {sum = 0} {if(NR>1){sum = sum + $1 }} END {print "AVG_GPU_MEMORY_USE=", sum / (NR-2)}' >> $tmpfile + rm -rf $tmpfile1 set +m } @@ -1883,8 +1916,11 @@ set -x python ${PADDLE_ROOT}/tools/pyCov_multithreading.py ${PADDLE_ROOT} wait; - #generate ut map + #generate ut file map python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} + + #generate ut mem map + python ${PADDLE_ROOT}/tools/get_ut_mem_map.py $tmp_dir } function get_failedUts_precise_map_file { @@ -2380,6 +2416,8 @@ function parallel_test() { pip install hypothesis pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python + cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python ut_total_startTime_s=`date +%s` if [ "$WITH_CINN" == "ON" ];then parallel_test_base_cinn @@ -2493,21 +2531,25 @@ EOF ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl + ref_paddle310=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp310-cp310-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl + ref_paddle310_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp310-cp310-linux_x86_64.whl if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl + ref_paddle310=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp310-cp310-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl 
ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl + ref_paddle310_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp310-cp310-linux_x86_64.whl fi ref_paddle36_mv1="" @@ -2620,6 +2662,22 @@ EOF apt-get clean -y && \ rm -f ${ref_paddle39} && \ ldconfig +EOF + cat >> ${PADDLE_ROOT}/build/Dockerfile < /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.10.0.tgz + RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ + wget ${ref_web}/${ref_paddle310} && pip3.10 install ${ref_paddle310_whl}; apt-get install -f -y && \ + apt-get clean -y && \ + rm -f ${ref_paddle310} && \ + ldconfig EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <tmp.txt + curl -X GET ${get_html}/commits >tmp.txt merge_commit=`grep "sha" tmp.txt| awk -F \" 'NR==1{print $(NF-1)}'| sed 's# ##g'` - curl -X GET ${get_html}/commits/${merge_commit} -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + curl -X GET ${get_html}/commits/${merge_commit} >tmp.txt merge_pr=`grep -oP -m 1 '(#[0-9]*)' tmp.txt| sed 's/#//g'` - curl -X GET ${get_html}/pulls/${merge_pr}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + curl -X GET ${get_html}/pulls/${merge_pr}/commits >tmp.txt pr_commit=`grep "sha" tmp.txt |tail -3|head -1|awk -F : '{print $NF}'|sed 's#"##g'|sed 's#,##g'| sed 's# ##g'` set +e wget -q https://xly-devops.bj.bcebos.com/PR/Paddle/${merge_pr}/${pr_commit}/workspace/Paddle/build/proto_so.tar.gz @@ -2945,7 +3003,7 @@ function check_coverage_build() { set +x if [ ${diff_coverage_build_size} -gt 3 ]; then - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 29832297 6836917 43953930` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 2c977e923b5b1..f5cfd14e6b84c 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -1,5 +1,11 @@ # for paddle test case if(WITH_TESTING) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc phi_utils) + set(paddle_gtest_main_deps device_context gtest gflags init memory phi_utils proto_desc) + + if (WITH_GPU OR WITH_ROCM) + list(APPEND paddle_gtest_main_deps gpu_info) + endif() + + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps}) endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index bb919f0e9110c..16c683e39fa8c 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -20,6 +20,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/init.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(enable_gpu_memory_usage_log); +#endif + int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); testing::InitGoogleTest(&argc, argv); @@ -81,6 +85,13 @@ int main(int argc, char** argv) { VLOG(1) << "gtest undefok_string:" << undefok_string; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (strstr(undefok_str, "enable_gpu_memory_usage_log")) { + VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true"; + FLAGS_enable_gpu_memory_usage_log = true; + } +#endif + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); ::GFLAGS_NAMESPACE::ParseCommandLineFlags( diff --git a/paddle/utils/array_ref.h b/paddle/utils/array_ref.h index 788710925936b..6731ad80e9350 100644 --- a/paddle/utils/array_ref.h +++ b/paddle/utils/array_ref.h @@ -3,8 +3,10 @@ // 1. remove hash_value functions // 2. replace with the llvm::NoneType with paddle::none_t // 3. remove drop_while, drop_until, take_while, take_until methods +// 4. change ArrayRef to array_ref to unify naming style of utils -//===- ArrayRef.h - Array Reference Wrapper ---------------------*- C++ -*-===// +//===- ArrayRef.h - Array Reference Wrapper ---------------------*- C++ +//-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -29,19 +31,19 @@ namespace paddle { -/// ArrayRef - Represent a constant reference to an array (0 or more elements +/// array_ref - Represent a constant reference to an array (0 or more elements /// consecutively in memory), i.e. a start pointer and a length. It allows /// various APIs to take consecutive elements easily and conveniently. /// /// This class does not own the underlying data, it is expected to be used in /// situations where the data resides in some other buffer, whose lifetime -/// extends past that of the ArrayRef. For this reason, it is not in general -/// safe to store an ArrayRef. +/// extends past that of the array_ref. For this reason, it is not in general +/// safe to store an array_ref. /// /// This is intended to be trivially copyable, so it should be passed by /// value. template -class ArrayRef { +class array_ref { public: using iterator = const T *; using const_iterator = const T *; @@ -59,81 +61,81 @@ class ArrayRef { /// @name Constructors /// @{ - /// Construct an empty ArrayRef. - /*implicit*/ ArrayRef() = default; + /// Construct an empty array_ref. + /*implicit*/ array_ref() = default; - /// Construct an empty ArrayRef from None. - /*implicit*/ ArrayRef(none_t) {} + /// Construct an empty array_ref from None. + /*implicit*/ array_ref(none_t) {} - /// Construct an ArrayRef from a single element. - /*implicit*/ ArrayRef(const T &OneElt) : Data(&OneElt), Length(1) {} + /// Construct an array_ref from a single element. + /*implicit*/ array_ref(const T &OneElt) : Data(&OneElt), Length(1) {} - /// Construct an ArrayRef from a pointer and length. - /*implicit*/ ArrayRef(const T *data, size_t length) + /// Construct an array_ref from a pointer and length. + /*implicit*/ array_ref(const T *data, size_t length) : Data(data), Length(length) {} - /// Construct an ArrayRef from a range. - ArrayRef(const T *begin, const T *end) : Data(begin), Length(end - begin) {} + /// Construct an array_ref from a range. 
+ array_ref(const T *begin, const T *end) : Data(begin), Length(end - begin) {} - /// Construct an ArrayRef from a SmallVector. This is templated in order to - /// avoid instantiating SmallVectorTemplateCommon whenever we - /// copy-construct an ArrayRef. + /// Construct an array_ref from a small_vector. This is templated in order to + /// avoid instantiating small_vector_template_common whenever we + /// copy-construct an array_ref. template - /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) + /*implicit*/ array_ref(const small_vector_template_common &Vec) : Data(Vec.data()), Length(Vec.size()) {} - /// Construct an ArrayRef from a std::vector. + /// Construct an array_ref from a std::vector. template - /*implicit*/ ArrayRef(const std::vector &Vec) + /*implicit*/ array_ref(const std::vector &Vec) : Data(Vec.data()), Length(Vec.size()) {} - /// Construct an ArrayRef from a std::array + /// Construct an array_ref from a std::array template - /*implicit*/ constexpr ArrayRef(const std::array &Arr) + /*implicit*/ constexpr array_ref(const std::array &Arr) : Data(Arr.data()), Length(N) {} - /// Construct an ArrayRef from a C array. + /// Construct an array_ref from a C array. template - /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + /*implicit*/ constexpr array_ref(const T (&Arr)[N]) : Data(Arr), Length(N) {} -/// Construct an ArrayRef from a std::initializer_list. +/// Construct an array_ref from a std::initializer_list. #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 9 // Disable gcc's warning in this constructor as it generates an enormous // amount -// of messages. Anyone using ArrayRef should already be aware of the fact that +// of messages. Anyone using array_ref should already be aware of the fact that // it does not do lifetime extension. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Winit-list-lifetime" #endif - /*implicit*/ ArrayRef(const std::initializer_list &Vec) + /*implicit*/ array_ref(const std::initializer_list &Vec) : Data(Vec.begin() == Vec.end() ? (T *)nullptr : Vec.begin()), Length(Vec.size()) {} #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 9 #pragma GCC diagnostic pop #endif - /// Construct an ArrayRef from ArrayRef. This uses SFINAE to + /// Construct an array_ref from array_ref. This uses SFINAE to /// ensure that only ArrayRefs of pointers can be converted. template - ArrayRef(const ArrayRef &A, - std::enable_if_t::value> - * = nullptr) + array_ref(const array_ref &A, + std::enable_if_t::value> + * = nullptr) : Data(A.data()), Length(A.size()) {} - /// Construct an ArrayRef from a SmallVector. This is - /// templated in order to avoid instantiating SmallVectorTemplateCommon - /// whenever we copy-construct an ArrayRef. + /// Construct an array_ref from a small_vector. This is + /// templated in order to avoid instantiating small_vector_template_common + /// whenever we copy-construct an array_ref. template - /*implicit*/ ArrayRef( - const SmallVectorTemplateCommon &Vec, + /*implicit*/ array_ref( + const small_vector_template_common &Vec, std::enable_if_t::value> * = nullptr) : Data(Vec.data()), Length(Vec.size()) {} - /// Construct an ArrayRef from std::vector. This uses SFINAE + /// Construct an array_ref from std::vector. This uses SFINAE /// to ensure that only vectors of pointers can be converted. 
template - ArrayRef( + array_ref( const std::vector &Vec, std::enable_if_t::value> * = 0) : Data(Vec.data()), Length(Vec.size()) {} @@ -168,50 +170,50 @@ class ArrayRef { return Data[Length - 1]; } - // copy - Allocate copy in Allocator and return ArrayRef to it. + // copy - Allocate copy in Allocator and return array_ref to it. template - ArrayRef copy(Allocator &A) { + array_ref copy(Allocator &A) { T *Buff = A.template Allocate(Length); std::uninitialized_copy(begin(), end(), Buff); - return ArrayRef(Buff, Length); + return array_ref(Buff, Length); } /// equals - Check for element-wise equality. - bool equals(ArrayRef RHS) const { + bool equals(array_ref RHS) const { if (Length != RHS.Length) return false; return std::equal(begin(), end(), RHS.begin()); } /// slice(n, m) - Chop off the first N elements of the array, and keep M /// elements in the array. - ArrayRef slice(size_t N, size_t M) const { + array_ref slice(size_t N, size_t M) const { assert(N + M <= size() && "Invalid specifier"); - return ArrayRef(data() + N, M); + return array_ref(data() + N, M); } /// slice(n) - Chop off the first N elements of the array. - ArrayRef slice(size_t N) const { return slice(N, size() - N); } + array_ref slice(size_t N) const { return slice(N, size() - N); } /// Drop the first \p N elements of the array. - ArrayRef drop_front(size_t N = 1) const { + array_ref drop_front(size_t N = 1) const { assert(size() >= N && "Dropping more elements than exist"); return slice(N, size() - N); } /// Drop the last \p N elements of the array. - ArrayRef drop_back(size_t N = 1) const { + array_ref drop_back(size_t N = 1) const { assert(size() >= N && "Dropping more elements than exist"); return slice(0, size() - N); } /// Return a copy of *this with only the first \p N elements. - ArrayRef take_front(size_t N = 1) const { + array_ref take_front(size_t N = 1) const { if (N >= size()) return *this; return drop_back(size() - N); } /// Return a copy of *this with only the last \p N elements. - ArrayRef take_back(size_t N = 1) const { + array_ref take_back(size_t N = 1) const { if (N >= size()) return *this; return drop_front(size() - N); } @@ -229,7 +231,7 @@ class ArrayRef { /// The declaration here is extra complicated so that "arrayRef = {}" /// continues to select the move assignment operator. template - std::enable_if_t::value, ArrayRef> &operator=( + std::enable_if_t::value, array_ref> &operator=( U &&Temporary) = delete; /// Disallow accidental assignment from a temporary. @@ -237,7 +239,7 @@ class ArrayRef { /// The declaration here is extra complicated so that "arrayRef = {}" /// continues to select the move assignment operator. template - std::enable_if_t::value, ArrayRef> &operator=( + std::enable_if_t::value, array_ref> &operator=( std::initializer_list) = delete; /// @} @@ -255,90 +257,90 @@ class ArrayRef { /// @} }; -/// @name ArrayRef Convenience constructors +/// @name array_ref Convenience constructors /// @{ -/// Construct an ArrayRef from a single element. +/// Construct an array_ref from a single element. template -ArrayRef makeArrayRef(const T &OneElt) { +array_ref make_array_ref(const T &OneElt) { return OneElt; } -/// Construct an ArrayRef from a pointer and length. +/// Construct an array_ref from a pointer and length. template -ArrayRef makeArrayRef(const T *data, size_t length) { - return ArrayRef(data, length); +array_ref make_array_ref(const T *data, size_t length) { + return array_ref(data, length); } -/// Construct an ArrayRef from a range. +/// Construct an array_ref from a range. 
template -ArrayRef makeArrayRef(const T *begin, const T *end) { - return ArrayRef(begin, end); +array_ref make_array_ref(const T *begin, const T *end) { + return array_ref(begin, end); } -/// Construct an ArrayRef from a SmallVector. +/// Construct an array_ref from a small_vector. template -ArrayRef makeArrayRef(const SmallVectorImpl &Vec) { +array_ref make_array_ref(const small_vector_impl &Vec) { return Vec; } -/// Construct an ArrayRef from a SmallVector. +/// Construct an array_ref from a small_vector. template -ArrayRef makeArrayRef(const SmallVector &Vec) { +array_ref make_array_ref(const small_vector &Vec) { return Vec; } -/// Construct an ArrayRef from a std::vector. +/// Construct an array_ref from a std::vector. template -ArrayRef makeArrayRef(const std::vector &Vec) { +array_ref make_array_ref(const std::vector &Vec) { return Vec; } -/// Construct an ArrayRef from a std::array. +/// Construct an array_ref from a std::array. template -ArrayRef makeArrayRef(const std::array &Arr) { +array_ref make_array_ref(const std::array &Arr) { return Arr; } -/// Construct an ArrayRef from an ArrayRef (no-op) (const) +/// Construct an array_ref from an array_ref (no-op) (const) template -ArrayRef makeArrayRef(const ArrayRef &Vec) { +array_ref make_array_ref(const array_ref &Vec) { return Vec; } -/// Construct an ArrayRef from an ArrayRef (no-op) +/// Construct an array_ref from an array_ref (no-op) template -ArrayRef &makeArrayRef(ArrayRef &Vec) { +array_ref &make_array_ref(array_ref &Vec) { return Vec; } -/// Construct an ArrayRef from a C array. +/// Construct an array_ref from a C array. template -ArrayRef makeArrayRef(const T (&Arr)[N]) { - return ArrayRef(Arr); +array_ref make_array_ref(const T (&Arr)[N]) { + return array_ref(Arr); } /// @} -/// @name ArrayRef Comparison Operators +/// @name array_ref Comparison Operators /// @{ template -inline bool operator==(ArrayRef LHS, ArrayRef RHS) { +inline bool operator==(array_ref LHS, array_ref RHS) { return LHS.equals(RHS); } template -inline bool operator==(SmallVectorImpl &LHS, ArrayRef RHS) { - return ArrayRef(LHS).equals(RHS); +inline bool operator==(small_vector_impl &LHS, array_ref RHS) { + return array_ref(LHS).equals(RHS); } template -inline bool operator!=(ArrayRef LHS, ArrayRef RHS) { +inline bool operator!=(array_ref LHS, array_ref RHS) { return !(LHS == RHS); } template -inline bool operator!=(SmallVectorImpl &LHS, ArrayRef RHS) { +inline bool operator!=(small_vector_impl &LHS, array_ref RHS) { return !(LHS == RHS); } diff --git a/paddle/utils/array_ref_test.cc b/paddle/utils/array_ref_test.cc index 33a09c499246d..cc4e88a5ee351 100644 --- a/paddle/utils/array_ref_test.cc +++ b/paddle/utils/array_ref_test.cc @@ -21,53 +21,53 @@ #include "gtest/gtest.h" TEST(array_ref, array_ref) { - paddle::ArrayRef a; + paddle::array_ref a; CHECK_EQ(a.size(), size_t(0)); CHECK_EQ(a.data(), static_cast(nullptr)); - paddle::ArrayRef b(paddle::none); + paddle::array_ref b(paddle::none); CHECK_EQ(b.size(), size_t(0)); CHECK_EQ(b.data(), static_cast(nullptr)); int v = 1; - paddle::ArrayRef c(v); + paddle::array_ref c(v); CHECK_EQ(c.size(), size_t(1)); CHECK_EQ(c.data(), &v); - CHECK_EQ(c.equals(paddle::makeArrayRef(v)), true); + CHECK_EQ(c.equals(paddle::make_array_ref(v)), true); int v1[5] = {1, 2, 3, 4, 5}; - paddle::ArrayRef d(v1, 5); + paddle::array_ref d(v1, 5); CHECK_EQ(d.size(), size_t(5)); CHECK_EQ(d.data(), v1); - CHECK_EQ(d.equals(paddle::makeArrayRef(v1, 5)), true); + CHECK_EQ(d.equals(paddle::make_array_ref(v1, 5)), true); - 
paddle::ArrayRef e(&v1[0], &v1[4]); + paddle::array_ref e(&v1[0], &v1[4]); CHECK_EQ(e.size(), size_t(4)); CHECK_EQ(e.data(), v1); - CHECK_EQ(e.equals(paddle::makeArrayRef(&v1[0], &v1[4])), true); + CHECK_EQ(e.equals(paddle::make_array_ref(&v1[0], &v1[4])), true); - paddle::SmallVector small_vector{1, 2, 3}; - paddle::ArrayRef f(small_vector); + paddle::small_vector small_vector{1, 2, 3}; + paddle::array_ref f(small_vector); CHECK_EQ(f.size(), size_t(3)); CHECK_EQ(f.data(), small_vector.data()); - CHECK_EQ(f.equals(paddle::makeArrayRef(small_vector)), true); + CHECK_EQ(f.equals(paddle::make_array_ref(small_vector)), true); std::vector vector{1, 2, 3}; - paddle::ArrayRef g(vector); + paddle::array_ref g(vector); CHECK_EQ(g.size(), size_t(3)); CHECK_EQ(g.data(), vector.data()); - CHECK_EQ(g.equals(paddle::makeArrayRef(vector)), true); + CHECK_EQ(g.equals(paddle::make_array_ref(vector)), true); std::initializer_list list = {1, 2, 3}; - paddle::ArrayRef h(list); + paddle::array_ref h(list); CHECK_EQ(h.size(), size_t(3)); CHECK_EQ(h.data(), list.begin()); - paddle::ArrayRef i(h); + paddle::array_ref i(h); CHECK_EQ(i.size(), size_t(3)); CHECK_EQ(i.data(), list.begin()); CHECK_EQ(i.equals(h), true); - CHECK_EQ(i.equals(paddle::makeArrayRef(h)), true); + CHECK_EQ(i.equals(paddle::make_array_ref(h)), true); auto slice = i.slice(1, 2); CHECK_EQ(slice.size(), size_t(2)); @@ -78,7 +78,7 @@ TEST(array_ref, array_ref) { CHECK_EQ(drop.size(), size_t(1)); CHECK_EQ(drop[0], 3); - paddle::ArrayRef nums = {1, 2, 3, 4, 5, 6, 7, 8}; + paddle::array_ref nums = {1, 2, 3, 4, 5, 6, 7, 8}; auto front = nums.take_front(3); CHECK_EQ(front.size(), size_t(3)); for (size_t i = 0; i < 3; ++i) { diff --git a/paddle/utils/small_vector.h b/paddle/utils/small_vector.h index 14cb8f410f460..27db9ae18822a 100644 --- a/paddle/utils/small_vector.h +++ b/paddle/utils/small_vector.h @@ -5,6 +5,7 @@ // 3. add at(index) method for small vector // 4. wrap the call to max and min with parenthesis to prevent the macro // expansion to fix the build error on windows platform +// 5. change SmallVector to small_vector to unify naming style of utils //===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// // @@ -79,13 +80,13 @@ iterator_range make_range(std::pair p) { /// This is all the stuff common to all SmallVectors. /// /// The template parameter specifies the type which should be used to hold the -/// Size and Capacity of the SmallVector, so it can be adjusted. -/// Using 32 bit size is desirable to shrink the size of the SmallVector. -/// Using 64 bit size is desirable for cases like SmallVector, where a +/// Size and Capacity of the small_vector, so it can be adjusted. +/// Using 32 bit size is desirable to shrink the size of the small_vector. +/// Using 64 bit size is desirable for cases like small_vector, where a /// 32 bit size would limit the vector to ~4GB. SmallVectors are used for /// buffering bitcode output - which can exceed 4GB. template -class SmallVectorBase { +class small_vector_base { protected: void *BeginX; Size_T Size = 0, Capacity; @@ -95,8 +96,8 @@ class SmallVectorBase { return (std::numeric_limits::max)(); } - SmallVectorBase() = delete; - SmallVectorBase(void *FirstEl, size_t TotalCapacity) + small_vector_base() = delete; + small_vector_base(void *FirstEl, size_t TotalCapacity) : BeginX(FirstEl), Capacity(TotalCapacity) {} /// This is a helper for \a grow() that's out of line to reduce code @@ -139,22 +140,23 @@ using SmallVectorSizeType = /// Figure out the offset of the first element. 
template struct SmallVectorAlignmentAndSize { - alignas(SmallVectorBase>) char Base[sizeof( - SmallVectorBase>)]; + alignas(small_vector_base>) char Base[sizeof( + small_vector_base>)]; alignas(T) char FirstEl[sizeof(T)]; }; -/// This is the part of SmallVectorTemplateBase which does not depend on whether -/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// This is the part of small_vector_template_base which does not depend on +/// whether +/// the type T is a POD. The extra dummy template argument is used by array_ref /// to avoid unnecessarily requiring T to be complete. template -class SmallVectorTemplateCommon - : public SmallVectorBase> { - using Base = SmallVectorBase>; +class small_vector_template_common + : public small_vector_base> { + using Base = small_vector_base>; /// Find the address of the first element. For this pointer math to be valid /// with small-size of 0 for T with lots of alignment, it's important that - /// SmallVectorStorage is properly-aligned even for small-size of 0. + /// small_vector_storage is properly-aligned even for small-size of 0. void *getFirstEl() const { return const_cast(reinterpret_cast( reinterpret_cast(this) + @@ -163,7 +165,7 @@ class SmallVectorTemplateCommon // Space after 'FirstEl' is clobbered, do not add any instance vars after it. protected: - SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {} + small_vector_template_common(size_t Size) : Base(getFirstEl(), Size) {} void grow_pod(size_t MinSize, size_t TSize) { Base::grow_pod(getFirstEl(), MinSize, TSize); @@ -358,7 +360,7 @@ class SmallVectorTemplateCommon } }; -/// SmallVectorTemplateBase - This is where we put +/// small_vector_template_base - This is where we put /// method implementations that are designed to work with non-trivial T's. /// /// We approximate is_trivially_copyable with trivial move/copy construction and @@ -370,14 +372,15 @@ template ::value) && (std::is_trivially_move_constructible::value) && std::is_trivially_destructible::value> -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { - friend class SmallVectorTemplateCommon; +class small_vector_template_base : public small_vector_template_common { + friend class small_vector_template_common; protected: static constexpr bool TakesParamByValue = false; using ValueParamT = const T &; - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + small_vector_template_base(size_t Size) + : small_vector_template_common(Size) {} static void destroy_range(T *S, T *E) { while (S != E) { @@ -410,7 +413,7 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { /// in \p NewCapacity. This is the first section of \a grow(). T *mallocForGrow(size_t MinSize, size_t &NewCapacity) { return static_cast( - SmallVectorBase>::mallocForGrow( + small_vector_base>::mallocForGrow( MinSize, sizeof(T), NewCapacity)); } @@ -480,7 +483,7 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { // Define this out-of-line to dissuade the C++ compiler from inlining it. template -void SmallVectorTemplateBase::grow(size_t MinSize) { +void small_vector_template_base::grow(size_t MinSize) { size_t NewCapacity; T *NewElts = mallocForGrow(MinSize, NewCapacity); moveElementsForGrow(NewElts); @@ -489,7 +492,7 @@ void SmallVectorTemplateBase::grow(size_t MinSize) { // Define this out-of-line to dissuade the C++ compiler from inlining it. 
template -void SmallVectorTemplateBase::moveElementsForGrow( +void small_vector_template_base::moveElementsForGrow( T *NewElts) { // Move the elements over. this->uninitialized_move(this->begin(), this->end(), NewElts); @@ -500,7 +503,7 @@ void SmallVectorTemplateBase::moveElementsForGrow( // Define this out-of-line to dissuade the C++ compiler from inlining it. template -void SmallVectorTemplateBase::takeAllocationForGrow( +void small_vector_template_base::takeAllocationForGrow( T *NewElts, size_t NewCapacity) { // If this wasn't grown from the inline copy, deallocate the old space. if (!this->isSmall()) free(this->begin()); @@ -509,13 +512,14 @@ void SmallVectorTemplateBase::takeAllocationForGrow( this->Capacity = NewCapacity; } -/// SmallVectorTemplateBase - This is where we put +/// small_vector_template_base - This is where we put /// method implementations that are designed to work with trivially copyable /// T's. This allows using memcpy in place of copy/move construction and /// skipping destruction. template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { - friend class SmallVectorTemplateCommon; +class small_vector_template_base + : public small_vector_template_common { + friend class small_vector_template_common; protected: /// True if it's cheap enough to take parameters by value. Doing so avoids @@ -527,7 +531,8 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { using ValueParamT = typename std::conditional::type; - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + small_vector_template_base(size_t Size) + : small_vector_template_common(Size) {} // No need to do a destroy loop for POD's. static void destroy_range(T *, T *) {} @@ -557,7 +562,7 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { T2 *Dest, std::enable_if_t::type, T2>::value> * = nullptr) { - // Use memcpy for PODs iterated by pointers (which includes SmallVector + // Use memcpy for PODs iterated by pointers (which includes small_vector // iterators): std::uninitialized_copy optimizes to memmove, but we can // use memcpy here. Note that I and E are iterators and thus might be // invalid for memcpy if they are equal. @@ -612,11 +617,11 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { void pop_back() { this->set_size(this->size() - 1); } }; -/// This class consists of common code factored out of the SmallVector class to -/// reduce code duplication based on the SmallVector 'N' template parameter. +/// This class consists of common code factored out of the small_vector class to +/// reduce code duplication based on the small_vector 'N' template parameter. template -class SmallVectorImpl : public SmallVectorTemplateBase { - using SuperClass = SmallVectorTemplateBase; +class small_vector_impl : public small_vector_template_base { + using SuperClass = small_vector_template_base; public: using iterator = typename SuperClass::iterator; @@ -625,16 +630,16 @@ class SmallVectorImpl : public SmallVectorTemplateBase { using size_type = typename SuperClass::size_type; protected: - using SmallVectorTemplateBase::TakesParamByValue; + using small_vector_template_base::TakesParamByValue; using ValueParamT = typename SuperClass::ValueParamT; // Default ctor - Initialize to empty. 
- explicit SmallVectorImpl(unsigned N) : SmallVectorTemplateBase(N) {} + explicit small_vector_impl(unsigned N) : small_vector_template_base(N) {} public: - SmallVectorImpl(const SmallVectorImpl &) = delete; + small_vector_impl(const small_vector_impl &) = delete; - ~SmallVectorImpl() { + ~small_vector_impl() { // Subclass has already destructed this vector's elements. // If this wasn't grown from the inline copy, deallocate the old space. if (!this->isSmall()) free(this->begin()); @@ -695,9 +700,9 @@ class SmallVectorImpl : public SmallVectorTemplateBase { return Result; } - void swap(SmallVectorImpl &RHS); + void swap(small_vector_impl &RHS); - /// Add the specified range to the end of the SmallVector. + /// Add the specified range to the end of the small_vector. template ::iterator_category, @@ -719,7 +724,7 @@ class SmallVectorImpl : public SmallVectorTemplateBase { void append(std::initializer_list IL) { append(IL.begin(), IL.end()); } - void append(const SmallVectorImpl &RHS) { append(RHS.begin(), RHS.end()); } + void append(const small_vector_impl &RHS) { append(RHS.begin(), RHS.end()); } void assign(size_type NumElts, ValueParamT Elt) { // Note that Elt could be an internal reference. @@ -755,7 +760,7 @@ class SmallVectorImpl : public SmallVectorTemplateBase { append(IL); } - void assign(const SmallVectorImpl &RHS) { assign(RHS.begin(), RHS.end()); } + void assign(const small_vector_impl &RHS) { assign(RHS.begin(), RHS.end()); } iterator erase(const_iterator CI) { // Just cast away constness because this is a non-const member function. @@ -976,24 +981,26 @@ class SmallVectorImpl : public SmallVectorTemplateBase { return this->back(); } - SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + small_vector_impl &operator=(const small_vector_impl &RHS); - SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + small_vector_impl &operator=(small_vector_impl &&RHS); - bool operator==(const SmallVectorImpl &RHS) const { + bool operator==(const small_vector_impl &RHS) const { if (this->size() != RHS.size()) return false; return std::equal(this->begin(), this->end(), RHS.begin()); } - bool operator!=(const SmallVectorImpl &RHS) const { return !(*this == RHS); } + bool operator!=(const small_vector_impl &RHS) const { + return !(*this == RHS); + } - bool operator<(const SmallVectorImpl &RHS) const { + bool operator<(const small_vector_impl &RHS) const { return std::lexicographical_compare( this->begin(), this->end(), RHS.begin(), RHS.end()); } }; template -void SmallVectorImpl::swap(SmallVectorImpl &RHS) { +void small_vector_impl::swap(small_vector_impl &RHS) { if (this == &RHS) return; // We can only avoid copying elements if neither vector is small. @@ -1028,8 +1035,8 @@ void SmallVectorImpl::swap(SmallVectorImpl &RHS) { } template -SmallVectorImpl &SmallVectorImpl::operator=( - const SmallVectorImpl &RHS) { +small_vector_impl &small_vector_impl::operator=( + const small_vector_impl &RHS) { // Avoid self-assignment. if (this == &RHS) return *this; @@ -1076,7 +1083,8 @@ SmallVectorImpl &SmallVectorImpl::operator=( } template -SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { +small_vector_impl &small_vector_impl::operator=( + small_vector_impl &&RHS) { // Avoid self-assignment. if (this == &RHS) return *this; @@ -1135,38 +1143,38 @@ SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { return *this; } -/// Storage for the SmallVector elements. This is specialized for the N=0 case +/// Storage for the small_vector elements. 
This is specialized for the N=0 case /// to avoid allocating unnecessary storage. template -struct SmallVectorStorage { +struct small_vector_storage { alignas(T) char InlineElts[N * sizeof(T)]; }; /// We need the storage to be properly aligned even for small-size of 0 so that -/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is +/// the pointer math in \a small_vector_template_common::getFirstEl() is /// well-defined. template -struct alignas(T) SmallVectorStorage {}; +struct alignas(T) small_vector_storage {}; -/// Forward declaration of SmallVector so that +/// Forward declaration of small_vector so that /// calculateSmallVectorDefaultInlinedElements can reference -/// `sizeof(SmallVector)`. +/// `sizeof(small_vector)`. template -class SmallVector; +class small_vector; /// Helper class for calculating the default number of inline elements for -/// `SmallVector`. +/// `small_vector`. /// /// This should be migrated to a constexpr function when our minimum /// compiler support is enough for multi-statement constexpr functions. template struct CalculateSmallVectorDefaultInlinedElements { // Parameter controlling the default number of inlined elements - // for `SmallVector`. + // for `small_vector`. // // The default number of inlined elements ensures that // 1. There is at least one inlined element. - // 2. `sizeof(SmallVector) <= kPreferredSmallVectorSizeof` unless + // 2. `sizeof(small_vector) <= kPreferredSmallVectorSizeof` unless // it contradicts 1. static constexpr size_t kPreferredSmallVectorSizeof = 64; @@ -1175,14 +1183,14 @@ struct CalculateSmallVectorDefaultInlinedElements { // Because our policy guarantees at least one inlined element, it is possible // for an arbitrarily large inlined element to allocate an arbitrarily large // amount of inline storage. We generally consider it an antipattern for a - // SmallVector to allocate an excessive amount of inline storage, so we want + // small_vector to allocate an excessive amount of inline storage, so we want // to call attention to these cases and make sure that users are making an // intentional decision if they request a lot of inline storage. // // We want this assertion to trigger in pathological cases, but otherwise // not be too easy to hit. To accomplish that, the cutoff is actually somewhat // larger than kPreferredSmallVectorSizeof (otherwise, - // `SmallVector>` would be one easy way to trip it, and that + // `small_vector>` would be one easy way to trip it, and that // pattern seems useful in practice). // // One wrinkle is that this assertion is in theory non-portable, since @@ -1195,14 +1203,14 @@ struct CalculateSmallVectorDefaultInlinedElements { static_assert( sizeof(T) <= 256, "You are trying to use a default number of inlined elements for " - "`SmallVector` but `sizeof(T)` is really big! Please use an " - "explicit number of inlined elements with `SmallVector` to make " + "`small_vector` but `sizeof(T)` is really big! Please use an " + "explicit number of inlined elements with `small_vector` to make " "sure you really want that much inline storage."); // Discount the size of the header itself when calculating the maximum inline // bytes. static constexpr size_t PreferredInlineBytes = - kPreferredSmallVectorSizeof - sizeof(SmallVector); + kPreferredSmallVectorSizeof - sizeof(small_vector); static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T); static constexpr size_t value = NumElementsThatFit == 0 ? 
1 : NumElementsThatFit; @@ -1216,27 +1224,27 @@ struct CalculateSmallVectorDefaultInlinedElements { /// /// \note /// In the absence of a well-motivated choice for the number of inlined -/// elements \p N, it is recommended to use \c SmallVector (that is, +/// elements \p N, it is recommended to use \c small_vector (that is, /// omitting the \p N). This will choose a default number of inlined elements /// reasonable for allocation on the stack (for example, trying to keep \c -/// sizeof(SmallVector) around 64 bytes). +/// sizeof(small_vector) around 64 bytes). /// /// \warning This does not attempt to be exception safe. /// /// \see https://llvm.org/docs/ProgrammersManual.html#llvm-adt-smallvector-h template ::value> -class SmallVector : public SmallVectorImpl, SmallVectorStorage { +class small_vector : public small_vector_impl, small_vector_storage { public: - SmallVector() : SmallVectorImpl(N) {} + small_vector() : small_vector_impl(N) {} - ~SmallVector() { + ~small_vector() { // Destroy the constructed elements in the vector. this->destroy_range(this->begin(), this->end()); } - explicit SmallVector(size_t Size, const T &Value = T()) - : SmallVectorImpl(N) { + explicit small_vector(size_t Size, const T &Value = T()) + : small_vector_impl(N) { this->assign(Size, Value); } @@ -1244,65 +1252,65 @@ class SmallVector : public SmallVectorImpl, SmallVectorStorage { typename = std::enable_if_t::iterator_category, std::input_iterator_tag>::value>> - SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + small_vector(ItTy S, ItTy E) : small_vector_impl(N) { this->append(S, E); } template - explicit SmallVector(const iterator_range &R) - : SmallVectorImpl(N) { + explicit small_vector(const iterator_range &R) + : small_vector_impl(N) { this->append(R.begin(), R.end()); } - SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + small_vector(std::initializer_list IL) : small_vector_impl(N) { this->assign(IL); } - SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) SmallVectorImpl::operator=(RHS); + small_vector(const small_vector &RHS) : small_vector_impl(N) { + if (!RHS.empty()) small_vector_impl::operator=(RHS); } - SmallVector &operator=(const SmallVector &RHS) { - SmallVectorImpl::operator=(RHS); + small_vector &operator=(const small_vector &RHS) { + small_vector_impl::operator=(RHS); return *this; } - SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) SmallVectorImpl::operator=(::std::move(RHS)); + small_vector(small_vector &&RHS) : small_vector_impl(N) { + if (!RHS.empty()) small_vector_impl::operator=(::std::move(RHS)); } - SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) SmallVectorImpl::operator=(::std::move(RHS)); + small_vector(small_vector_impl &&RHS) : small_vector_impl(N) { + if (!RHS.empty()) small_vector_impl::operator=(::std::move(RHS)); } - SmallVector &operator=(SmallVector &&RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); + small_vector &operator=(small_vector &&RHS) { + small_vector_impl::operator=(::std::move(RHS)); return *this; } - SmallVector &operator=(SmallVectorImpl &&RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); + small_vector &operator=(small_vector_impl &&RHS) { + small_vector_impl::operator=(::std::move(RHS)); return *this; } - SmallVector &operator=(std::initializer_list IL) { + small_vector &operator=(std::initializer_list IL) { this->assign(IL); return *this; } }; template -inline size_t capacity_in_bytes(const SmallVector &X) { +inline size_t capacity_in_bytes(const 
small_vector &X) { return X.capacity_in_bytes(); } /// Given a range of type R, iterate the entire range and return a -/// SmallVector with elements of the vector. This is useful, for example, +/// small_vector with elements of the vector. This is useful, for example, /// when you want to iterate a range and then sort the results. template -SmallVector()))>::type>::type, - Size> +small_vector()))>::type>::type, + Size> to_vector(R &&Range) { return {std::begin(Range), std::end(Range)}; } @@ -1352,22 +1360,22 @@ struct Struct32B { alignas(32) void *X; }; } -static_assert(sizeof(SmallVector) == +static_assert(sizeof(small_vector) == sizeof(unsigned) * 2 + sizeof(void *), - "wasted space in SmallVector size 0"); -static_assert(alignof(SmallVector) >= alignof(Struct16B), + "wasted space in small_vector size 0"); +static_assert(alignof(small_vector) >= alignof(Struct16B), "wrong alignment for 16-byte aligned T"); -static_assert(alignof(SmallVector) >= alignof(Struct32B), +static_assert(alignof(small_vector) >= alignof(Struct32B), "wrong alignment for 32-byte aligned T"); -static_assert(sizeof(SmallVector) >= alignof(Struct16B), +static_assert(sizeof(small_vector) >= alignof(Struct16B), "missing padding for 16-byte aligned T"); -static_assert(sizeof(SmallVector) >= alignof(Struct32B), +static_assert(sizeof(small_vector) >= alignof(Struct32B), "missing padding for 32-byte aligned T"); -static_assert(sizeof(SmallVector) == +static_assert(sizeof(small_vector) == sizeof(unsigned) * 2 + sizeof(void *) * 2, - "wasted space in SmallVector size 1"); + "wasted space in small_vector size 1"); -static_assert(sizeof(SmallVector) == +static_assert(sizeof(small_vector) == sizeof(void *) * 2 + sizeof(void *), "1 byte elements have word-sized type for size and capacity"); @@ -1375,7 +1383,7 @@ static_assert(sizeof(SmallVector) == /// std::length_error or calls report_fatal_error. static void report_size_overflow(size_t MinSize, size_t MaxSize); static void report_size_overflow(size_t MinSize, size_t MaxSize) { - std::string Reason = "SmallVector unable to grow. Requested capacity (" + + std::string Reason = "small_vector unable to grow. Requested capacity (" + std::to_string(MinSize) + ") is larger than maximum value for size type (" + std::to_string(MaxSize) + ")"; @@ -1387,7 +1395,7 @@ static void report_size_overflow(size_t MinSize, size_t MaxSize) { static void report_at_maximum_capacity(size_t MaxSize); static void report_at_maximum_capacity(size_t MaxSize) { std::string Reason = - "SmallVector capacity unable to grow. Already at maximum size " + + "small_vector capacity unable to grow. Already at maximum size " + std::to_string(MaxSize); throw std::length_error(Reason); } @@ -1415,18 +1423,18 @@ static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) { // Note: Moving this function into the header may cause performance regression. template -void *SmallVectorBase::mallocForGrow(size_t MinSize, - size_t TSize, - size_t &NewCapacity) { +void *small_vector_base::mallocForGrow(size_t MinSize, + size_t TSize, + size_t &NewCapacity) { NewCapacity = getNewCapacity(MinSize, TSize, this->capacity()); return safe_malloc(NewCapacity * TSize); } // Note: Moving this function into the header may cause performance regression. 
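
The inline-sizing policy spelled out in the comments above can be checked numerically. A minimal Python sketch, assuming a 16-byte empty-vector header (consistent with the patch's static_assert of sizeof(unsigned) * 2 + sizeof(void *) for the uint32_t size type on common 64-bit targets); the helper name is invented for illustration, while the 64-byte target, the 256-byte element cap and the at-least-one-element rule come straight from the comments:

```python
def default_inlined_elements(sizeof_t, header_bytes=16, preferred_sizeof=64):
    """Rough rendering of CalculateSmallVectorDefaultInlinedElements."""
    # A *default* N is refused for very large element types; callers must
    # then request an explicit number of inlined elements.
    assert sizeof_t <= 256, "use an explicit N for big element types"
    preferred_inline_bytes = preferred_sizeof - header_bytes
    n = preferred_inline_bytes // sizeof_t
    # Rule 1 of the policy: always at least one inlined element, even if
    # that pushes sizeof(small_vector<T>) past the 64-byte preference.
    return max(n, 1)

# e.g. 8-byte elements (pointers) -> 6 inlined, 128-byte elements -> 1
assert default_inlined_elements(8) == 6
assert default_inlined_elements(128) == 1
```
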
template -void SmallVectorBase::grow_pod(void *FirstEl, - size_t MinSize, - size_t TSize) { +void small_vector_base::grow_pod(void *FirstEl, + size_t MinSize, + size_t TSize) { size_t NewCapacity = getNewCapacity(MinSize, TSize, this->capacity()); void *NewElts; if (BeginX == FirstEl) { @@ -1443,38 +1451,38 @@ void SmallVectorBase::grow_pod(void *FirstEl, this->Capacity = NewCapacity; } -template class paddle::SmallVectorBase; +template class paddle::small_vector_base; // Disable the uint64_t instantiation for 32-bit builds. // Both uint32_t and uint64_t instantiations are needed for 64-bit builds. // This instantiation will never be used in 32-bit builds, and will cause // warnings when sizeof(Size_T) > sizeof(size_t). #if SIZE_MAX > UINT32_MAX -template class paddle::SmallVectorBase; +template class paddle::small_vector_base; // Assertions to ensure this #if stays in sync with SmallVectorSizeType. static_assert(sizeof(SmallVectorSizeType) == sizeof(uint64_t), - "Expected SmallVectorBase variant to be in use."); + "Expected small_vector_base variant to be in use."); #else static_assert(sizeof(SmallVectorSizeType) == sizeof(uint32_t), - "Expected SmallVectorBase variant to be in use."); + "Expected small_vector_base variant to be in use."); #endif } // namespace paddle namespace std { -/// Implement std::swap in terms of SmallVector swap. +/// Implement std::swap in terms of small_vector swap. template -inline void swap(paddle::SmallVectorImpl &LHS, - paddle::SmallVectorImpl &RHS) { +inline void swap(paddle::small_vector_impl &LHS, + paddle::small_vector_impl &RHS) { LHS.swap(RHS); } -/// Implement std::swap in terms of SmallVector swap. +/// Implement std::swap in terms of small_vector swap. template -inline void swap(paddle::SmallVector &LHS, - paddle::SmallVector &RHS) { +inline void swap(paddle::small_vector &LHS, + paddle::small_vector &RHS) { LHS.swap(RHS); } diff --git a/paddle/utils/small_vector_test.cc b/paddle/utils/small_vector_test.cc index 96bcec5940056..e061c232152c5 100644 --- a/paddle/utils/small_vector_test.cc +++ b/paddle/utils/small_vector_test.cc @@ -21,7 +21,7 @@ #include "gtest/gtest.h" template -static std::vector ToStdVector(const paddle::SmallVector &vec) { +static std::vector ToStdVector(const paddle::small_vector &vec) { std::vector std_vec; std_vec.reserve(vec.size()); for (size_t i = 0; i < vec.size(); ++i) { @@ -35,7 +35,7 @@ void SmallVectorCheck(size_t n) { std::srand(std::time(nullptr)); std::vector std_vec; - paddle::SmallVector vec; + paddle::small_vector vec; for (size_t i = 0; i < n; ++i) { int value = rand(); // NOLINT diff --git a/paddle/utils/variant.h b/paddle/utils/variant.h index 7b11ae1bee88c..50bdc4287e21a 100644 --- a/paddle/utils/variant.h +++ b/paddle/utils/variant.h @@ -13,6 +13,10 @@ #pragma once +// gcc >= 9 has a bug that creates a false positive warning. 
+// Reference: +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92145 +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89381 #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 9 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-copy" diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index e303ce1216822..408a1fdaafeef 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -648,6 +648,9 @@ def complete_forward_annotation(self, serial_main_program): self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.clear_dist_info_for_graph() + # NOTE:[HighOrderGrad] update vars and ops distributed attribute in high order gradient + self.complete_high_order_grad_annotation(serial_main_program) + # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() @@ -655,6 +658,164 @@ def complete_forward_annotation(self, serial_main_program): return serial_main_program + def complete_high_order_grad_annotation(self, serial_main_program): + """ + NOTE: + [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. + This function is temporary to support high order gradient, and will be removed in the future. + """ + + def _is_grad_var_name(name): + if "@GRAD" in name: + return True + return False + + def _get_op_by_id(ops, id): + for op in ops: + if op.desc.id() == id: + return op + return None + + ops = list(serial_main_program.global_block().ops) + vars = serial_main_program.global_block().vars + dist_op_context = self._dist_context.dist_op_context + grad_var_to_var = dist_op_context.grad_var_to_var + + appended_grad_times = 0 + for idx in range(0, len(ops)): + op = ops[idx] + if int(op.attr('op_role')) == int( + core.op_proto_and_checker_maker.OpRole.Forward): + continue + + if int(op.attr('op_role')) == int( + core.op_proto_and_checker_maker.OpRole.Backward) and int( + ops[idx - 1].attr('op_role')) == int( + core.op_proto_and_checker_maker.OpRole.Forward): + appended_grad_times += 1 + + # complete the annotation of grad op (xxx_grad op or sum op) + # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id + grad_op = ops[idx] + if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + # TODO support the case where one forward op corresponding to multiple xxx_grad op + forward_op = _get_op_by_id( + ops, dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + assert forward_op is not None + + fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( + forward_op) + fwd_op_process_mesh = fwd_op_dist_attr.process_mesh + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = fwd_op_process_mesh + + for input_name in grad_op.input_arg_names: + if input_name not in forward_op.input_arg_names and input_name not in forward_op.output_arg_names: + if input_name in grad_var_to_var[appended_grad_times]: + fwd_name = grad_var_to_var[appended_grad_times][ + input_name] + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( + fwd_name) + else: + input_var = vars[input_name] + ref_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + input_var).dims_mapping + else: + if fwd_op_dist_attr.get_input_dims_mapping(input_name): + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( + input_name) + else: + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( + 
input_name) + assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format( + input_name) + grad_op_dist_attr.set_input_dims_mapping(input_name, + ref_dims_mapping) + + for output_name in grad_op.output_arg_names: + assert output_name in grad_var_to_var[appended_grad_times] + fwd_name = grad_var_to_var[appended_grad_times][output_name] + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( + fwd_name) + # var + output_var = vars[output_name] + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = fwd_op_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr.set_output_dims_mapping(output_name, + ref_dims_mapping) + + self._dist_context.set_op_dist_attr_for_program( + grad_op, grad_op_dist_attr) + + # grad ops that have not a corresponding mapping in grad_op_id_to_op_id + else: + + if grad_op.type == 'sum': + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + output_name = grad_op.output_arg_names[0] + assert output_name in grad_var_to_var[appended_grad_times], \ + "sum op's output '{}' has no corresponding var".format( + output_name) + ref_fwd_var_name = grad_var_to_var[appended_grad_times][ + output_name] + ref_fwd_var = vars[ref_fwd_var_name] + ref_fwd_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_fwd_var) + ref_fwd_dims_mapping = ref_fwd_dist_attr.dims_mapping + ref_fwd_process_mesh = ref_fwd_dist_attr.process_mesh + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_fwd_dims_mapping + tensor_dist_attr.process_mesh = ref_fwd_process_mesh + output_var = vars[output_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_fwd_process_mesh + for var_name in grad_op.input_arg_names: + grad_op_dist_attr.set_input_dims_mapping( + var_name, ref_fwd_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping( + output_name, ref_fwd_dims_mapping) + + elif grad_op.type == 'fill_zeros_like': + ref_var_name = grad_op.input_arg_names[0] + ref_var = vars[ref_var_name] + ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_var) + ref_dims_mapping = ref_dist_attr.dims_mapping + ref_process_mesh = ref_dist_attr.process_mesh + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = ref_process_mesh + output_var_name = grad_op.output_arg_names[0] + output_var = vars[output_var_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_process_mesh + grad_op_dist_attr.set_input_dims_mapping(ref_var_name, + ref_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping(output_var_name, + ref_dims_mapping) + + elif grad_op.type in ['shape', 'fill_constant']: + continue + + else: + raise ValueError("got unexpect op [{}]".format( + str(grad_op.type))) + + self._dist_context.set_op_dist_attr_for_program( + grad_op, grad_op_dist_attr) + def complete_backward_annotation(self, serial_main_program): """Complete the annotation of vars and ops in the backward phase for parallel program.""" @@ -689,6 +850,8 @@ def _get_op_by_id(ops, id): ops = list(serial_main_program.global_block().ops) vars = 
serial_main_program.global_block().vars dist_op_context = self._dist_context.dist_op_context + grad_var_to_var = dist_op_context.grad_var_to_var[len( + dist_op_context.grad_var_to_var)] for idx in range(first_backward_op_idx, len(ops)): @@ -765,102 +928,111 @@ def _get_op_by_id(ops, id): grad_op, grad_op_dist_attr) continue - # op dist attr - forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( + fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( forward_op) - forward_op_process_mesh = forward_op_dist_attr.process_mesh + fwd_op_process_mesh = fwd_op_dist_attr.process_mesh grad_op_dist_attr = OperatorDistributedAttribute() - grad_op_dist_attr.process_mesh = forward_op_process_mesh + grad_op_dist_attr.process_mesh = fwd_op_process_mesh - # var for input_name in grad_op.input_arg_names: - input_var = vars[input_name] - ref_dims_mapping = None - if "@GRAD" in input_name: - forward_name = _get_forward_varname_from_grad_varname( - input_name) - ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( - forward_name) + if input_name not in forward_op.input_arg_names and input_name not in forward_op.output_arg_names: + if input_name in grad_var_to_var: + fwd_name = grad_var_to_var[input_name] + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( + fwd_name) + else: + input_var = vars[input_name] + ref_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + input_var).dims_mapping else: - if forward_op_dist_attr.get_input_dims_mapping( - input_name): - ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( + if fwd_op_dist_attr.get_input_dims_mapping(input_name): + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( input_name) else: - ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( input_name) - assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format( - input_var.name) + input_name) grad_op_dist_attr.set_input_dims_mapping(input_name, ref_dims_mapping) - for output_name in grad_op.desc.output_names(): - assert len(grad_op.desc.output(output_name)) in [0, 1] - if _is_grad_var_name(output_name): - input_name = _get_forward_varname_from_grad_varname( - output_name) - else: - assert grad_op.type in [ - "cast", "c_identity", "c_allreduce_sum" - ] - input_name = "X" - assert input_name in forward_op.desc.input_names( - ), "var [{}] in op [{}]'s output but could not find [{}] in its forward op".format( - output_name, grad_op.type, input_name) - if len(grad_op.desc.output(output_name)) == 1: - # tensor dist attr - output_var = vars[grad_op.desc.output(output_name)[0]] - forward_name = _get_forward_varname_from_grad_varname( - output_var.name) - ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( - forward_name) - - output_var_dist_attr = TensorDistributedAttribute() - output_var_dist_attr.dims_mapping = ref_dims_mapping - output_var_dist_attr.process_mesh = forward_op_process_mesh - self._dist_context.set_tensor_dist_attr_for_program( - output_var, output_var_dist_attr) - - grad_op_dist_attr.set_output_dims_mapping( - output_var.name, ref_dims_mapping) + for output_name in grad_op.output_arg_names: + assert output_name in grad_var_to_var + fwd_name = grad_var_to_var[output_name] + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( + fwd_name) + # var + output_var = vars[output_name] + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + 
tensor_dist_attr.process_mesh = fwd_op_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr.set_output_dims_mapping(output_name, + ref_dims_mapping) self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - # only sum op for merge mutiple version grad has no a corresponding mapping in grad_op_id_to_op_id + # grad ops that have not a corresponding mapping in grad_op_id_to_op_id else: - assert grad_op.type == "sum", "got unexpect op [{}]".format( - str(grad_op.type)) - assert all(map(_is_grad_var_name, grad_op.input_arg_names)) - assert len(grad_op.output_arg_names) == 1 - - ref_forward_var_name = _get_forward_varname_from_grad_varname( - grad_op.output_arg_names[0]) - forward_var = vars[ref_forward_var_name] - ref_forward_var_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( - forward_var).dims_mapping - ref_forward_var_process_mesh = self._dist_context.get_tensor_dist_attr_for_program( - forward_var).process_mesh + if grad_op.type == 'sum': + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + output_name = grad_op.output_arg_names[0] + assert output_name in grad_var_to_var, "sum op's output '{}' has no corresponding var".format( + output_name) + ref_fwd_var_name = grad_var_to_var[output_name] + ref_fwd_var = vars[ref_fwd_var_name] + ref_fwd_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_fwd_var) + ref_fwd_dims_mapping = ref_fwd_dist_attr.dims_mapping + ref_fwd_process_mesh = ref_fwd_dist_attr.process_mesh + + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_fwd_dims_mapping + tensor_dist_attr.process_mesh = ref_fwd_process_mesh + output_var = vars[output_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) - # output - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping - tensor_dist_attr.process_mesh = ref_forward_var_process_mesh - self._dist_context.set_tensor_dist_attr_for_program( - vars[grad_op.output_arg_names[0]], tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_fwd_process_mesh + for var_name in grad_op.input_arg_names: + grad_op_dist_attr.set_input_dims_mapping( + var_name, ref_fwd_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping( + output_name, ref_fwd_dims_mapping) + + elif grad_op.type == 'fill_zeros_like': + ref_var_name = grad_op.input_arg_names[0] + ref_var = vars[ref_var_name] + ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_var) + ref_dims_mapping = ref_dist_attr.dims_mapping + ref_process_mesh = ref_dist_attr.process_mesh + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = ref_process_mesh + output_var_name = grad_op.output_arg_names[0] + output_var = vars[output_var_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_process_mesh + grad_op_dist_attr.set_input_dims_mapping(ref_var_name, + ref_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping(output_var_name, + ref_dims_mapping) + + else: + raise ValueError("got unexpect op [{}]".format( + str(grad_op.type))) - # op - grad_op_dist_attr = OperatorDistributedAttribute() - 
grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh - for var_name in grad_op.input_arg_names: - assert _get_forward_varname_from_grad_varname( - var_name) == ref_forward_var_name - grad_op_dist_attr.set_input_dims_mapping( - var_name, ref_forward_var_dims_mapping) - - grad_op_dist_attr.set_output_dims_mapping( - grad_op.output_arg_names[0], ref_forward_var_dims_mapping) self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 2807c46540ab1..7e245358d4bcc 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -120,6 +120,11 @@ def dist_main_programs(self): def dist_startup_programs(self): return self._dist_startup_programs + @property + def is_annotation(self): + return len(self._dist_tensors_for_program) or len( + self._dist_ops_for_program) + def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ 'The type of dim_mapping must be ProcessMesh.' @@ -577,6 +582,7 @@ def __init__(self): self._cur_src_op = None self._cur_dist_attr = None self.grad_op_id_to_op_id = {} + self.grad_var_to_var = defaultdict(dict) self._work_block = None self.already_init_sync_vars = set() self.varname_mapping = None diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 9449b52952cd8..cc08bc1a901b7 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -16,6 +16,7 @@ import numpy as np import paddle from .utils import to_list +from paddle.fluid.layers.utils import flatten from paddle.io import DataLoader, DistributedBatchSampler @@ -56,16 +57,17 @@ def __init__(self, data_parallel_world_size=None, data_parallel_rank=None, drop_last=False, - inputs=[]): + sample_generator=True): self.feed_list = feed_list self.places = places self.steps_per_epoch = steps_per_epoch + self._sample_generator = sample_generator + super(NonIterableGeneratorLoader, self).__init__( dataset, batch_size, epochs, data_parallel_world_size, data_parallel_rank, drop_last) self._inner_dataloader = self._create_inner_dataloader() self._steps = self._infer_steps() - self._inputs = inputs def __iter__(self): self._cur_step = 0 @@ -91,27 +93,28 @@ def _infer_steps(self): return steps_per_epoch def _create_inner_dataloader(self): - def data_generator(): + def sample_data_generator(): batch_data = None for step, data in enumerate(self.dataset): - if not isinstance(data, list): - data = to_list(data) - - if self.batch_size == 1: - yield data + data = flatten(data) + if batch_data is None: + batch_data = [[] for i in range(len(data))] + for idx in range(len(data)): + batch_data[idx].append(data[idx]) + if (step + 1) % self.batch_size == 0: + yield batch_data batch_data = None - else: - if batch_data is None: - batch_data = [[] for i in range(len(data))] - - for idx in range(len(data)): - batch_data[idx].append(data[idx]) - if (step + 1) % self.batch_size == 0: - yield batch_data - batch_data = None + def batch_data_generator(): + for data in self.dataset: + data = flatten(data) + yield data dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=self.feed_list, capacity=70, iterable=False) - dataloader.set_batch_generator(data_generator, self.places) + if self._sample_generator: + 
dataloader.set_batch_generator(sample_data_generator, self.places) + else: + dataloader.set_batch_generator(batch_data_generator, self.places) + return dataloader diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a5fec789dfb37..2cd841ef80979 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -17,18 +17,22 @@ from collections import defaultdict import paddle +import paddle.distributed.auto_parallel as auto + from paddle import fluid from paddle.io import Dataset from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core from paddle.fluid import program_guard +from paddle.fluid.layers.utils import flatten +from paddle.fluid.executor import global_scope from paddle.fluid.backward import append_backward from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.distributed.passes import new_pass, PassContext from paddle.distributed.utils import get_logger +from paddle.distributed.passes import new_pass, PassContext from .mapper import mapping from .cluster import Cluster @@ -61,6 +65,12 @@ def __init__(self, self.strategy = strategy self._executor = None + self._cur_rank = paddle.distributed.get_rank() + self._nranks = paddle.distributed.get_world_size() + self._saver = DistributedSaver() + self._logger = get_logger(logging.INFO) + + self._default_strategy = None self._orig_main_prog = fluid.default_main_program() self._orig_startup_prog = fluid.default_startup_program() self._orig_dist_context = get_default_distributed_context() @@ -70,9 +80,6 @@ def __init__(self, self._dist_startup_progs = defaultdict(dict) # dist startup programs self._dist_contexts = {} self._pass_contexts = {} - self._cur_rank = paddle.distributed.get_rank() - self._logger = get_logger(logging.INFO) - self._saver = DistributedSaver() self._feed_vars = {} self._fetch_vars = {} @@ -86,13 +93,11 @@ def prepare(self, # TODO: check loss type self._loss = loss self._metrics = to_list(metrics) - for m in ['train', 'predict']: - self.mode = m - self._build(m) # build forward program - self._plan(m) # completion & planner - self._parallel(m, all_ranks) # parallel - self._initialize(m) # init comm and startup program - self.mode = mode + self._mode = mode + self._build(mode) # build forward program + self._plan(mode) # completion & planner + self._parallel(mode, all_ranks) # parallel + self._initialize(mode) # init comm and startup program def _build(self, mode): serial_main_prog = self._serial_main_progs.get(mode, None) @@ -112,10 +117,16 @@ def _build(self, mode): if mode != "predict" and self._loss: losses = to_list(self._loss(*(outputs + labels))) + default_ctx = get_default_distributed_context() + if not default_ctx.is_annotation or self._default_strategy: + inputs = [self._set_data_parallel(var) for var in inputs] + labels = [self._set_data_parallel(var) for var in labels] + + # print(serial_main_prog) self._feed_vars[mode] = {"inputs": inputs, "labels": labels} self._fetch_vars[mode] = { - "outputs": outputs, + "outputs": flatten(outputs), "loss": losses, "metrics": metrics } @@ -128,6 +139,12 @@ def _build(self, mode): self._pass_contexts[mode] = PassContext() def _plan(self, mode): + + # NOTE: [HighOrderGrad]. 
There are grad ops in forward phase, and it need + # dependency of backward-forward ops in forward completition. + defualt_ctx = get_default_distributed_context() + self._dist_contexts[mode]._dist_op_context = defualt_ctx.dist_op_context + # Complete the distributed annotation serial_main_prog = self._serial_main_progs[mode] self._completer = Completer(self._dist_contexts[mode]) @@ -147,13 +164,14 @@ def _parallel(self, mode, all_ranks=False): self._parallel_program(mode, rank) def _initialize(self, mode): - # Traverse different rank programs and traverse each op of them, - # instantiate communication by process_mapping. - all_process_groups = get_all_process_groups() - for process_group in all_process_groups: - if self._cur_rank not in process_group.ranks: - continue - process_group.instantiate() + if self._nranks > 1: + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping. + all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if self._cur_rank not in process_group.ranks: + continue + process_group.instantiate() # initialize self._place = _get_device() @@ -161,8 +179,16 @@ def _initialize(self, mode): self._place = fluid.CUDAPlace(ParallelEnv().dev_id) if self._executor is None: self._executor = paddle.static.Executor(self._place) - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] - self._executor.run(dist_startup_prog) + uninitialized = [] + dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + for var in dist_startup_prog.list_vars(): + scope_var = global_scope().find_var(var.name) + if scope_var and scope_var.get_tensor()._is_initialized(): + continue + uninitialized.append(var) + if uninitialized: + prune_startup_prog = dist_startup_prog._prune(uninitialized) + self._executor.run(prune_startup_prog) def _parallel_program(self, mode, rank): serial_main_program = self._serial_main_progs[mode] @@ -246,12 +272,13 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, if config["use_pure_fp16"]: config["base_opt"] = self._optimizer auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply( - [main_program], [startup_program], self._pass_context) + auto_parallel_fp16_pass.apply([main_program], + [startup_program], + self._pass_contexts[self.mode]) else: auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_context) + self._pass_contexts[self.mode]) # apply recompute pass if self.strategy.recompute: @@ -288,18 +315,26 @@ def _apply_post_optimization(self, main_program, startup_program, rank, [main_program], [startup_program], self._pass_contexts[self.mode]) - def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=None): + def fit(self, + train_data, + batch_size=1, + epochs=1, + steps_per_epoch=None, + use_program_cache=False, + return_numpy=True, + sample_generator=True): # TODO: callbacks # TODO: evaluate after training self.mode = 'train' - assert isinstance(train_data, Dataset) - train_dataloader = self._create_dataloader(train_data, batch_size, - epochs, steps_per_epoch) + assert self.mode in self._dist_main_progs, "train model is not ready, please call `engine.prepare(mode='train')` first." 
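
The sample_generator flag threaded through fit(), evaluate() and predict() here selects between the two generators added to NonIterableGeneratorLoader in dist_loader.py above. A stripped-down sketch with plain lists instead of Paddle tensors (the toy dataset and the use of list() in place of flatten() are illustrative assumptions):

```python
def sample_data_generator(dataset, batch_size):
    # Each dataset item is a single sample; collect batch_size of them
    # into per-field lists before yielding, as the patch does.
    batch_data = None
    for step, sample in enumerate(dataset):
        fields = list(sample)          # stands in for flatten(data)
        if batch_data is None:
            batch_data = [[] for _ in fields]
        for idx, value in enumerate(fields):
            batch_data[idx].append(value)
        if (step + 1) % batch_size == 0:
            yield batch_data
            batch_data = None

def batch_data_generator(dataset):
    # Each dataset item is already a full batch; just flatten and yield it.
    for batch in dataset:
        yield list(batch)

# Two fields (x, y), batch_size = 2:
samples = [(1, 10), (2, 20), (3, 30), (4, 40)]
assert list(sample_data_generator(samples, 2)) == [
    [[1, 2], [10, 20]], [[3, 4], [30, 40]]]
```
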
+ train_dataloader = self._create_dataloader( + train_data, batch_size, epochs, steps_per_epoch, sample_generator) outputs = [] for epoch in range(epochs): for step, data in enumerate(train_dataloader): - logs, loss = self._train_step(data) + logs, loss = self._train_step(data, use_program_cache, + return_numpy) outputs.append(loss) train_logs = { "train_" + name: val @@ -308,14 +343,35 @@ def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=None): self._logger.info(train_logs) return outputs + def evaluate(self, + eval_data, + batch_size=1, + use_program_cache=False, + return_numpy=True, + sample_generator=True): + self.mode = 'eval' + assert self.mode in self._dist_main_progs, "eval model is not ready, please call `engine.prepare(mode='eval')` first." + eval_dataloader = self._create_dataloader( + eval_data, batch_size, sample_generator=sample_generator) + + outputs = [] + for step, data in enumerate(eval_dataloader): + logs, outs = self._eval_step(data, use_program_cache, return_numpy) + outputs.append(outs) + predict_logs = {"eval_" + name: val for name, val in logs.items()} + self._logger.info(predict_logs) + return outputs + def predict(self, test_data, batch_size=1, use_program_cache=False, - return_numpy=True): + return_numpy=True, + sample_generator=True): self.mode = 'predict' - # TODO: need check dataset - test_dataloader = self._create_dataloader(test_data, batch_size) + assert self.mode in self._dist_main_progs, "predict model is not ready, please call `engine.prepare(mode='predict')` first." + test_dataloader = self._create_dataloader( + test_data, batch_size, sample_generator=sample_generator) outputs = [] for step, data in enumerate(test_dataloader): @@ -329,19 +385,39 @@ def predict(self, self._logger.info(predict_logs) return outputs - def _train_step(self, data): + def _train_step(self, data, use_program_cache=False, return_numpy=True): logs = {} dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] fetch_var = self._fetch_vars[self.mode]["loss"][0] if fetch_var.name not in dist_main_prog.global_block().vars: - loss = self._executor.run(dist_main_prog) + loss = self._executor.run(dist_main_prog, + use_program_cache=use_program_cache) logs["loss"] = None else: loss = self._executor.run(dist_main_prog, - fetch_list=to_list(fetch_var)) + fetch_list=to_list(fetch_var), + use_program_cache=use_program_cache, + return_numpy=return_numpy) logs["loss"] = loss return logs, loss + def _eval_step(self, data, use_program_cache=False, return_numpy=True): + logs = {} + dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] + fetch_var = self._fetch_vars[self.mode]["loss"][0] + + if fetch_var.name not in dist_main_prog.global_block().vars: + outs = self._executor.run(dist_main_prog, + use_program_cache=use_program_cache) + logs["loss"] = outs + else: + outs = self._executor.run(dist_main_prog, + fetch_list=fetch_var, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + logs["loss"] = outs + return logs, outs + def _predict_step(self, data, use_program_cache=False, return_numpy=True): logs = {} dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] @@ -366,7 +442,8 @@ def _create_dataloader(self, dataset, batch_size, epochs=1, - steps_per_epoch=None): + steps_per_epoch=None, + sample_generator=True): feed_list = self._feed_vars[self.mode]["inputs"] + self._feed_vars[ self.mode]["labels"] dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] @@ -376,9 +453,12 @@ def _create_dataloader(self, serial_main_prog = 
self._serial_main_progs[self.mode] serial_main_block = serial_main_prog.global_block() op_size = len(dist_main_block.ops) + if dist_main_block.ops[0].type == 'create_py_reader': + op_size -= 3 + for _ in range(3): + dist_main_block._remove_op(0, sync=False) places = paddle.static.cuda_places() with fluid.program_guard(dist_main_prog, dist_startup_prog): - inputs = self._feed_vars[self.mode]["inputs"] dataloader = NonIterableGeneratorLoader( dataset, feed_list, @@ -386,7 +466,7 @@ def _create_dataloader(self, batch_size, epochs, steps_per_epoch, - inputs=inputs) + sample_generator=sample_generator) new_op_size = len(dist_main_block.ops) for _ in range(new_op_size - 1, op_size - 1, -1): op = dist_main_block.ops[new_op_size - 1] @@ -396,7 +476,7 @@ def _create_dataloader(self, dist_main_block, new_op_desc, type=new_op_desc.type()) dist_main_block.ops.insert(0, new_op) for in_name in new_op.input_arg_names: - if in_name == "lod_tensor_blocking_queue_0": + if "lod_tensor_blocking_queue" in in_name: continue if in_name not in dist_main_block.vars: in_var = serial_main_block._var_recursive(in_name) @@ -424,6 +504,27 @@ def _validate_spec(self, specs): .format(i, spec)) return specs + def _set_data_parallel(self, var): + if self._nranks == 1: + self._default_strategy = 'serial' + auto.shard_tensor( + var, + dist_attr={ + "process_mesh": [0], + "dims_mapping": [-1 for _ in range(len(var.shape))] + }) + else: + self._default_strategy = 'dp' + auto.shard_tensor( + var, + dist_attr={ + "process_mesh": list(range(self._nranks)), + "dims_mapping": + [0] + [-1 for _ in range(len(var.shape) - 1)] + }) + + return var + def save(self, path, training=True, mode=None): if not mode: mode = self.mode @@ -459,3 +560,35 @@ def load(self, path, strict=True, load_optimizer=True, mode=None): dist_context = self._dist_contexts[mode] self._saver.load(path, dist_main_prog, dist_context, strict, load_optimizer) + + @property + def mode(self): + return self._mode + + @mode.setter + def mode(self, mode): + self._mode = mode + + @property + def metrics(self): + return self._metrics + + @property + def main_program(self): + return self._dist_main_progs[self.mode][self._cur_rank] + + @property + def startup_program(self): + return self._dist_startup_progs[self.mode][self._cur_rank] + + @property + def dist_context(self): + return self._dist_contexts[self.mode] + + @property + def serial_main_program(self): + return self._serial_main_progs[self.mode] + + @property + def serial_startup_program(self): + return self._serial_startup_progs[self.mode] diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 9fb200f4d2db9..4795050d15dcc 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -53,6 +53,10 @@ def __init__(self, name): def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + input_names = op_desc.input_names() + xshape_arg_names = [] + if "XShape" in input_names: + xshape_arg_names = op_desc.input("XShape") for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) @@ -63,10 +67,18 @@ def is_input_compatible(self, dist_op): # continue # if len(dims_mapping) < 1: # continue - if len(dims_mapping) > 1: - for mapping in dims_mapping[1:]: - if mapping != -1: - return False + if arg_name 
not in xshape_arg_names: + if len(dims_mapping) > 1: + for mapping in dims_mapping[1:]: + if mapping != -1: + return False + else: + if dims_mapping[0] != -1: + return False + if len(dims_mapping) > 2: + for mapping in dims_mapping[2:]: + if mapping != -1: + return False return True def is_output_compatible(self, dist_op): @@ -105,17 +117,31 @@ def is_auto_compatible(self, dist_op): op_dist_attr = dist_op.dist_attr batch_dim_mappings = [] # Check input compatibility + input_names = op_desc.input_names() + xshape_arg_names = [] + if "XShape" in input_names: + xshape_arg_names = op_desc.input("XShape") for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping) > 1: - for mapping in dims_mapping[1:]: - if mapping != -1: - return False - if len(dims_mapping) >= 1: - batch_dim_mappings.append(dims_mapping[0]) + if arg_name not in xshape_arg_names: + if len(dims_mapping) > 1: + for mapping in dims_mapping[1:]: + if mapping != -1: + return False + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) + else: + if dims_mapping[0] != -1: + return False + if len(dims_mapping) > 2: + for mapping in dims_mapping[2:]: + if mapping != -1: + return False + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) # Check output compatibility output_names = op_desc.output_names() @@ -160,24 +186,39 @@ def update_dims_mapping(self, dist_op): or op_desc.type() == "slice" \ or op_desc.type() == "while": return False + + input_names = op_desc.input_names() + input_xshape_arg_names = [] + if "XShape" in input_names: + input_xshape_arg_names = op_desc.input("XShape") + output_names = op_desc.output_names() - xshape_arg_names = [] + output_xshape_arg_names = [] if "XShape" in output_names: - xshape_arg_names = op_desc.output("XShape") + output_xshape_arg_names = op_desc.output("XShape") + batch_dim_mappings = [] for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping) >= 1: - batch_dim_mappings.append(dims_mapping[0]) + if arg_name not in input_xshape_arg_names: + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) + else: + batch_dim_mappings.append(dims_mapping[1]) for arg_name in op_desc.output_arg_names(): + if op_desc.type() == "fill_zeros_like": + input_tensor = dist_op.get_serial_input(op_desc.input_arg_names( + )[0]) + if input_tensor.is_parameter: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if arg_name not in xshape_arg_names: + if arg_name not in output_xshape_arg_names: if len(dims_mapping) >= 1: batch_dim_mappings.append(dims_mapping[0]) else: @@ -194,16 +235,27 @@ def update_dims_mapping(self, dist_op): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping - ) >= 1 and compatible_dim_mapping != dims_mapping[0]: - dims_mapping[0] = compatible_dim_mapping - changed = True + if arg_name not in input_xshape_arg_names: + if len(dims_mapping) >= 1 and \ + compatible_dim_mapping != dims_mapping[0]: + dims_mapping[0] = compatible_dim_mapping + changed = True + else: + if len(dims_mapping) >= 2 and \ + compatible_dim_mapping != dims_mapping[1]: + 
dims_mapping[1] = compatible_dim_mapping + changed = True for arg_name in op_desc.output_arg_names(): + if op_desc.type() == "fill_zeros_like": + input_tensor = dist_op.get_serial_input(op_desc.input_arg_names( + )[0]) + if input_tensor.is_parameter: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if arg_name not in xshape_arg_names: + if arg_name not in output_xshape_arg_names: if len(dims_mapping ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping @@ -371,30 +423,14 @@ def backward(ctx, *args, **kwargs): if need_gradient_allreduce: allreduce_vars = [] - for input_name in backward_op.desc.input_names(): - for varname in backward_op.desc.input(input_name): - if "@GRAD" not in varname and is_parameter_related( - varname, main_block): - # NOTE: When amp and recompute pass are effective at the same time, - # if a parameter is casted and recomputed, the 'parameter@GARD' can not - # be found in the grad_op's output. - if "subprog_" in varname: - varname = varname[:varname.index(".subprog_")] - - assert len( - backward_op.desc.input(input_name) - ) == 1, "parameter input to grad op should be length 1, but got [{}]".format( - backward_op.desc.input(input_name)) - - assert varname + "@GRAD" in backward_op.desc.output_arg_names( - ), "parameter's grad [{}] not found in the grad op's output".format( - varname + "@GRAD") - assert len( - backward_op.desc.output(input_name + "@GRAD") - ) == 1, "parameter grad of grad op should be length 1, but got [{}]".format( - backward_op.desc.output(input_name + "@GRAD")) - allreduce_vars.append( - backward_op.desc.output(input_name + "@GRAD")[0]) + for output_name in backward_op.desc.output_names(): + for varname in backward_op.desc.output(output_name): + if varname in kwargs["grad_var_to_var"]: + fwd_name = kwargs["grad_var_to_var"][varname] + if fwd_name not in main_block.vars: + continue + if is_parameter_related(fwd_name, main_block): + allreduce_vars.append(varname) if len(allreduce_vars) > 0: diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index c03ef9c06d80f..fe091cd08b72b 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -25,7 +25,7 @@ from .dist_attribute import OperatorDistributedAttribute from .process_group import new_process_group from .utils import set_dist_op_desc_original_id -from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op +from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op from .operators.common import BACKWARD_ONLY_DIST_OPS __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] @@ -198,15 +198,29 @@ def partition_block(self, ref_block, target_block): dist_op_context = self._dist_context.dist_op_context serial_ops = ref_block.ops + last_fwd_op_idx = -1 + for idx, op in enumerate(ref_block.ops): + if is_loss_op(op): + last_fwd_op_idx = idx + break + + if last_fwd_op_idx == -1: + last_fwd_op_idx = len(ref_block.ops) + # init mapping forward_op_id2forward_op = {} for idx in range(len(serial_ops)): - if is_forward_op(serial_ops[idx]): + if idx <= last_fwd_op_idx: forward_op_id2forward_op[serial_ops[idx].desc.id( )] = serial_ops[idx] + appended_grad_times = 0 # partiiton - for op in serial_ops: + for idx, op in enumerate(serial_ops): + + if is_backward_op(op) and 
(is_forward_op(serial_ops[idx - 1]) or + is_loss_op(serial_ops[idx - 1])): + appended_grad_times += 1 # partititon input variables for serial_input_varname in op.desc.input_arg_names(): @@ -244,8 +258,11 @@ def partition_block(self, ref_block, target_block): kinputs, koutputs = dist_op_context.prepare_context(op) dist_op_backward_impl = _get_dist_op_backward_implement( op, self._dist_context, forward_op_id2forward_op) - dist_op_backward_impl.backward(self._dist_context, **kinputs, - **koutputs) + grad_var_to_var = self._dist_context.dist_op_context.grad_var_to_var[ + appended_grad_times] + dist_op_backward_impl.backward( + self._dist_context, **kinputs, **koutputs, + **{"grad_var_to_var": grad_var_to_var}) else: raise NotImplementedError( "partitioner only support forward op and backward op, but got {}". diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index fc85cd04d4010..9c40034498dbc 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -996,69 +996,87 @@ def set_grad_var_shape(program, dist_context): block = program.global_block() vars = block.vars - for op in block.ops: + appended_grad_times = 0 + grad_var_to_var = dist_context.dist_op_context.grad_var_to_var + + for idx, op in enumerate(block.ops): + + if int(op.attr('op_role')) != int(OpRole.Backward): + continue + + if int(block.ops[idx-1].attr('op_role')) == int(OpRole.Forward) or \ + int(block.ops[idx-1].attr('op_role')) == 257: + appended_grad_times += 1 if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: break - if op.type in ["sum", "concat"]: + if op.type in ["sum", "concat", "shape"]: continue - if int(op.attr('op_role')) == int(OpRole.Backward): - op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - assert op_dist_attr is not None - for var_name in op.output_arg_names: - if "@GRAD" not in var_name: - continue + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr is not None + + for var_name in op.output_arg_names: + + if "@GRAD" not in var_name: + continue + if var_name in grad_var_to_var[appended_grad_times]: + forward_var_name = grad_var_to_var[appended_grad_times][ + var_name] + else: forward_var_name = var_name[:var_name.find("@GRAD")] - if op.type in [ - "c_allreduce_sum", "c_identity", "scale", "cast" - ]: - forward_var_name = op.input_arg_names[0] - elif op.type == "matmul_v2_grad": - forward_var_name = None - for output_name in op.output_names: - if var_name in op.output(output_name): - assert "@GRAD" in output_name - input_name = output_name[:output_name.find("@GRAD")] - assert len(op.input(input_name)) == 1 - forward_var_name = op.input(input_name)[0] - assert forward_var_name is not None - - need_set_shape_list = [ - "reshape2_grad", "softmax_with_cross_entropy_grad", - "transpose2_grad", "softmax_grad", "cross_entropy_grad2", - "dropout_grad" - ] - forward_list = [ - "reshape2", "softmax_with_cross_entropy", "transpose2", - "softmax", "cross_entropy2", "dropout" - ] - if op.type in need_set_shape_list: - for forward_op in block.ops: - assert int(forward_op.attr('op_role')) != int( - OpRole.Backward) - idx = need_set_shape_list.index(op.type) - forward_op_name = forward_list[idx] - if forward_op.type == forward_op_name and forward_var_name in forward_op.input_arg_names: - op_dist_attr = dist_context.get_op_dist_attr_for_program( - forward_op) - break - - forward_input_dist_attr = op_dist_attr.get_input_dist_attr( - 
forward_var_name) - assert forward_input_dist_attr is not None, f"{forward_var_name, str(op)}" - forward_var = vars[forward_var_name] - forward_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( - forward_var) - assert forward_var_dist_attr is not None - grad_var = vars[var_name] - ref_shape = infer_shape(block, forward_var, - forward_var_dist_attr, - forward_input_dist_attr) - - if list(grad_var.shape) != ref_shape: - grad_var.desc.set_shape(ref_shape) + + if op.type in [ + "c_allreduce_sum", "c_identity", "scale", "cast", + "fill_zeros_like" + ]: + forward_var_name = op.input_arg_names[0] + elif op.type == "matmul_v2_grad": + forward_var_name = None + for output_name in op.output_names: + if var_name in op.output(output_name): + assert "@GRAD" in output_name + input_name = output_name[:output_name.find("@GRAD")] + assert len(op.input(input_name)) == 1 + forward_var_name = op.input(input_name)[0] + assert forward_var_name is not None + + need_set_shape_list = [ + "reshape2_grad", "softmax_with_cross_entropy_grad", + "transpose2_grad", "softmax_grad", "cross_entropy_grad2", + "dropout_grad", "tanh_grad", "slice", "assign", + "matmul_v2_triple_grad", "elementwise_add_triple_grad", + "fill_constant", "sqrt_grad" + ] + forward_list = [ + "reshape2", "softmax_with_cross_entropy", "transpose2", + "softmax", "cross_entropy2", "dropout", "tanh", + ["slice_grad", "c_allgather"], "assign", "matmul_v2_grad_grad", + "elementwise_add_grad_grad", "shape", "sqrt" + ] + if op.type in need_set_shape_list: + for forward_op in block.ops: + idx = need_set_shape_list.index(op.type) + forward_op_name = forward_list[idx] + if forward_op.type in forward_op_name and forward_var_name in forward_op.input_arg_names: + op_dist_attr = dist_context.get_op_dist_attr_for_program( + forward_op) + break + + forward_input_dist_attr = op_dist_attr.get_input_dist_attr( + forward_var_name) + assert forward_input_dist_attr is not None, f"{forward_var_name, str(op)}" + forward_var = vars[forward_var_name] + forward_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( + forward_var) + assert forward_var_dist_attr is not None + grad_var = vars[var_name] + ref_shape = infer_shape(block, forward_var, forward_var_dist_attr, + forward_input_dist_attr) + + if list(grad_var.shape) != ref_shape: + grad_var.desc.set_shape(ref_shape) OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index b92b2a3c15dec..e33a3dba669ab 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -263,7 +263,7 @@ def _new_process_group_impl(backend, rank=global_rank, world_size=global_world_size, place=place, - gid=0, + gid=group_id, local_rank=rank, local_size=world_size, gloo_rank=cluster_id, @@ -350,18 +350,19 @@ def new_group(ranks=None, backend=None): global _default_group_name gid = _new_ring_id() group_name = _default_group_name + str(gid) - global_group = _get_default_group() - global_rank = global_group.rank - global_ranks = global_group.ranks - backend = _default_backend if backend is None else backend - if ranks is None: - ranks = global_ranks - assert len(ranks) <= len(global_ranks), ( - "Size of new group must be less than or " - "equal to that of the default global group.") + if ranks is None or len(ranks) > 1: + global_group = _get_default_group() + global_rank = global_group.rank + global_ranks = global_group.ranks + backend = _default_backend if backend is None else backend 
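
The completion.py, partitioner.py and utils.py hunks above all lean on the same segmentation idiom: walk the ops in program order and bump appended_grad_times whenever a backward op immediately follows a forward (or loss) op, so that each appended backward pass indexes its own slice of grad_var_to_var. A self-contained sketch, with op roles reduced to strings and the function name invented for illustration (utils.py additionally treats the loss role, 257, as forward for this purpose):

```python
def split_backward_segments(op_roles):
    """Map each backward op index to its appended-gradient segment."""
    appended_grad_times = 0
    segment_of = {}
    for idx, role in enumerate(op_roles):
        if role == 'forward':
            continue
        # A backward op directly after a forward op opens a new segment.
        if idx > 0 and op_roles[idx - 1] == 'forward':
            appended_grad_times += 1
        segment_of[idx] = appended_grad_times
    return segment_of

# forward program, first backward append, more forward ops (e.g. a higher
# order gradient recomputation), second backward append:
roles = ['forward', 'forward', 'backward', 'backward', 'forward', 'backward']
assert split_backward_segments(roles) == {2: 1, 3: 1, 5: 2}
```
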
+ if ranks is None: + ranks = global_ranks + assert len(ranks) <= len(global_ranks), ( + "Size of new group must be less than or " + "equal to that of the default global group.") size = len(ranks) ranks = sorted(ranks) - if global_rank in ranks and size > 1: + if size > 1 and global_rank in ranks: rank = ranks.index(global_rank) pg = _new_process_group_impl( backend, @@ -642,6 +643,8 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): op_type = core.ReduceOp.MAX elif op == ReduceOp.MIN: op_type = core.ReduceOp.MIN + elif op == ReduceOp.PROD: + op_type = core.ReduceOp.PRODUCT else: raise ValueError("Unknown reduce_op type for allreduce.") group = _get_default_group() if group is None else group @@ -744,6 +747,8 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): op_type = core.ReduceOp.MAX elif op == ReduceOp.MIN: op_type = core.ReduceOp.MIN + elif op == ReduceOp.PROD: + op_type = core.ReduceOp.PRODUCT else: raise ValueError("Unknown reduce_op type for reduce.") group = _get_default_group() if group is None else group diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 6fb6a5ca32b3c..ea3dc43d0c712 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -14,8 +14,9 @@ from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle import _C_ops def _number_count(numbers, upper_range): @@ -40,7 +41,9 @@ def _number_count(numbers, upper_range): number_count = paddle.distributed.utils.number_count(numbers, upper_range) print(number_count) # the result: [2, 0, 2, 0, 0, 0] """ - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.number_count(numbers, 'upper_range', upper_range) + elif _in_legacy_dygraph(): return core.ops.number_count(numbers, 'upper_range', upper_range) else: op_type = 'number_count' @@ -86,7 +89,9 @@ def _assign_pos(x, cum_count): pos = paddle.distributed.utils.assign_pos(x=numbers, cum_count=num_cum) print(pos) # the result: (2, 0, 3, 1) """ - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.assign_pos(x, cum_count, cum_count[-1]) + elif _in_legacy_dygraph(): return core.ops.assign_pos(x, cum_count, cum_count[-1]) else: op_type = 'assign_pos' @@ -120,7 +125,9 @@ def _random_routing(topk_idx, topk_value, prob, topk=2): prob: random prob, shape=(topk_idx.shape[0],) """ if topk == 2: - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.random_routing(prob, topk_value, topk_idx) + elif _in_legacy_dygraph(): return core.ops.random_routing(prob, topk_value, topk_idx) else: raise RuntimeError("Not supporting static mode now") @@ -149,7 +156,10 @@ def _limit_by_capacity(expert_count, capacity, n_worker): out = paddle.distributed.utils.limit_by_capacity(expert_count, capacity, n_work) print(out) # the result: [1, 2, 2, 4, 3, 3] """ - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.limit_by_capacity(expert_count, capacity, 'n_worker', + n_worker) + elif _in_legacy_dygraph(): return core.ops.limit_by_capacity(expert_count, capacity, 'n_worker', n_worker) else: @@ -192,8 +202,10 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker): # Tensor(shape=[8], dtype=int32, place=CUDAPlace(0), stop_gradient=True, [1, 3, 3, 3, 
-1, 2, 1, 1]) """ - - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.prune_gate_by_capacity(gate_idx, expert_count, "n_expert", + n_expert, "n_worker", n_worker) + elif _in_legacy_dygraph(): return core.ops.prune_gate_by_capacity( gate_idx, expert_count, "n_expert", n_expert, "n_worker", n_worker) check_variable_and_dtype(gate_idx, 'GateIdx', ['int32', 'int64'], diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index f0365cab8c896..53d35a251c8c8 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -219,8 +219,9 @@ def train(): "required to create a process group.") master_addr = os.getenv("MASTER_ADDR", None) master_port = os.getenv("MASTER_PORT", None) - endpoints = None - if not master_addr or not master_port: + endpoints = ":".join( + [master_addr, master_port]) if master_addr and master_port else None + if endpoints is None: endpoints = os.getenv("PADDLE_MASTER", None) if endpoints is None: endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0] diff --git a/python/paddle/distribution/multinomial.py b/python/paddle/distribution/multinomial.py index c4110040fd192..837eb53eab1ea 100644 --- a/python/paddle/distribution/multinomial.py +++ b/python/paddle/distribution/multinomial.py @@ -16,6 +16,10 @@ import paddle from paddle.distribution import categorical, distribution +try: + from collections.abc import Iterable +except: + from collections import Iterable class Multinomial(distribution.Distribution): @@ -138,7 +142,7 @@ def sample(self, shape=()): Args: sample_shape (tuple, optional): [description]. Defaults to (). """ - if not isinstance(shape, collections.Iterable): + if not isinstance(shape, Iterable): raise TypeError('sample shape must be Iterable object.') samples = self._categorical.sample([self.total_count, ] + list(shape)) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index c7e69753b5335..bc53c130286aa 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -28,6 +28,10 @@ import paddle.fluid from .data_feeder import check_type import warnings +try: + from collections.abc import Sequence +except: + from collections import Sequence __all__ = [ 'append_backward', 'gradients', @@ -474,12 +478,16 @@ def _accumulate_gradients_by_add_ops_(var_name, renamed_vars[var_name] = [var_name] -def _addup_repetitive_outputs_(op_descs, block_idx): +def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): """ In backward part, an variable may be the output of more than one ops. And one op may yield its multiple outputs to the same variable. In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. + + Args: + grad_var_to_var(dict): used to build the mapping between grad var name and forward var name. + Only for auto parallel. 
""" _MAX_ADD_NUM_ = framework._global_flags()['FLAGS_max_inplace_grad_add'] #pending_sum_ops = [] @@ -527,6 +535,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): new_name = var_name + "@RENAME@block" + str(block_idx) + "@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] += 1 + # Build the mapping between the new_name and var_name (Only for auto parallel) + if grad_var_to_var is not None: + if var_name in grad_var_to_var: + grad_var_to_var[new_name] = grad_var_to_var[ + var_name] + else: + grad_var_to_var[new_name] = var_name # rename original var_name renamed_vars[var_name][0] = new_name # before change: _rename_arg_(op_descs, var_name, @@ -553,6 +568,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): new_name = var_name + "@RENAME@block" + str(block_idx) + "@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] += 1 + # Build the mapping between the new_name and var_name (Only for auto parallel) + if grad_var_to_var is not None: + if var_name in grad_var_to_var: + grad_var_to_var[new_name] = grad_var_to_var[ + var_name] + else: + grad_var_to_var[new_name] = var_name arg_names[arg_idx] = new_name op_desc.set_output(param_name, arg_names) renamed_vars[var_name].append(new_name) @@ -1077,6 +1099,16 @@ def _append_backward_ops_(block, rename_var_map(dict): used to associate target_grad var name with first grad_op input name. Only used in for high order gradient. """ + + # Build the mapping between the forward op and backward op (Only for auto parallel) + def update_distop_context(distop_context, op_grad_to_var, + appending_grad_times): + distop_context.grad_var_to_var[appending_grad_times].update( + op_grad_to_var) + for op_desc in grad_op_desc: + assert op_desc.id() not in distop_context.grad_op_id_to_op_id + distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + if callbacks is not None: assert (isinstance(callbacks, (list, tuple))) for cb in callbacks: @@ -1114,11 +1146,18 @@ def _append_backward_ops_(block, # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) + # Build the mapping between the forward op and backward op (Only for auto parallel) if distop_context is not None: - for op_desc in grad_op_desc: - assert op_desc.id() not in distop_context.grad_op_id_to_op_id - distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + update_distop_context(distop_context, op_grad_to_var, + program._appending_grad_times) + else: + default_ctx = getattr(paddle.distributed.auto_parallel.dist_context, + '_g_default_distributed_context', None) + if default_ctx is not None: + distop_context = default_ctx.dist_op_context + update_distop_context(distop_context, op_grad_to_var, + program._appending_grad_times) # Set device for grad_op according to forward Op device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() @@ -1151,6 +1190,11 @@ def _append_backward_ops_(block, rename_var_map[name] = new_name if name in op_grad_to_var: + # Build the mapping between the grad var name and var name (Only for auto parallel) + if distop_context is not None: + distop_context.grad_var_to_var[ + program._appending_grad_times][ + new_name] = op_grad_to_var[name] op_grad_to_var[new_name] = op_grad_to_var[name] op_grad_to_var.pop(name) @@ -1183,8 +1227,14 @@ def _append_backward_ops_(block, grad_op_descs.extend(grad_op_desc) grad_to_var.update(op_grad_to_var) + # record mapping bewteen grad var name and var name (Only for auto parallel) + 
grad_var_to_var = None + if distop_context is not None: + grad_var_to_var = distop_context.grad_var_to_var[ + program._appending_grad_times] # sum parameter's gradients' var given multiple var gradient - grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx) + grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx, + grad_var_to_var) # if all outputs of the grad op are in no_grad_set, then just remove and fill zero # if all inputs of the grad op are in no_grad_set, just remove this op @@ -1722,7 +1772,7 @@ def append_backward(loss, def _as_list(x): if x is None: return [] - return list(x) if isinstance(x, collections.Sequence) else [x] + return list(x) if isinstance(x, Sequence) else [x] def _is_ancestor_block(ancestor_block, block): diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index d21b7e4740a6e..47c64ff8bd605 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -1001,9 +1001,6 @@ def compile(self, feed_list, fetch_list): a_pass.set('custom_ops', self._custom_op_names) a_pass.apply(self._graph) - a_pass = core.get_pass("transfer_cast_op_pass") - a_pass.apply(self._graph) - passes = [ 'ipu_inplace_pass', 'ipu_graph_builder_pass', diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py index 588eb2a29f555..c5b9b9e71f6be 100644 --- a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -129,9 +129,13 @@ def update_loss_scaling(x, 'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf, 'incr_ratio': incr_ratio, 'decr_ratio': decr_ratio, - 'stop_update': stop_update } + if isinstance(stop_update, Variable): + inputs['StopUpdate'] = stop_update + else: + attrs['stop_update'] = stop_update + helper.append_op( type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index c6e2bcb8b1a24..c3720396e1d77 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -432,7 +432,7 @@ def _add_dynamic_loss_scaling(self, params_grads, found_inf): self._decr_every_n_nan_or_inf, self._incr_ratio, self._decr_ratio, - stop_update=False, + stop_update=self._optimizer._get_stop_update_var(), name="update_loss_scaling") return diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 9dba5d658dfc9..7b2546f70ad1b 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -162,6 +162,7 @@ def _update_list(self): 'split', 'fused_feedforward', 'fused_attention', + 'fused_multi_transformer', } # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 760e9ceb9ea2f..0100866806cdc 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -109,6 +109,8 @@ def _keep_fp32_input(op, in_name): return in_name in { 'LnScale', 'LnBias', 'Ln2Scale', 'Ln2Bias', "Ln1Scale", "Ln1Bias" } + if op_type == 'fused_multi_transformer': + return in_name in {'LnScale', 'LnBias', 'FFNLnScale', 'FFNLnBias'} return False diff --git 
a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py index 14282df23d365..1f7a01f17b066 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py @@ -28,6 +28,27 @@ def forward(self, input): return input +def fuse_conv_bn(model): + is_train = False + if model.training: + model.eval() + is_train = True + fuse_list = [] + tmp_pair = [None, None] + for name, layer in model.named_sublayers(): + if isinstance(layer, nn.Conv2D): + tmp_pair[0] = name + if isinstance(layer, nn.BatchNorm2D): + tmp_pair[1] = name + + if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: + fuse_list.append(tmp_pair) + tmp_pair = [None, None] + model = fuse_layers(model, fuse_list) + if is_train: + model.train() + + def fuse_layers(model, layers_to_fuse, inplace=False): ''' fuse layers in layers_to_fuse diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 059cb7b0dd1bf..d5c3d9ab82d74 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -20,6 +20,7 @@ import warnings import paddle +import paddle.nn as nn import paddle.nn.quant.quant_layers as quant_layers from paddle.fluid import dygraph, core, framework, unique_name from paddle.fluid.framework import IrGraph @@ -32,6 +33,7 @@ from paddle.fluid.log_helper import get_logger from .. import quantization_pass from . import utils +from . import fuse_utils __all__ = ['ImperativeQuantAware'] @@ -52,6 +54,7 @@ def __init__( weight_bits=8, activation_bits=8, moving_rate=0.9, + fuse_conv_bn=False, weight_preprocess_layer=None, act_preprocess_layer=None, weight_quantize_layer=None, @@ -76,6 +79,7 @@ def __init__( activation_bits(int): quantization bit number for activations. moving_rate(float): the parameter for 'moving_average_abs_max' quantization. + fuse_conv_bn(bool): Whether to fuse conv and bn, default is False. weight_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess weight before quantization. Using this can quickly test if user's preprocess method works @@ -188,6 +192,7 @@ def forward(self, inputs): model_path="./imperative_model_qat") """ super(ImperativeQuantAware, self).__init__() + self.fuse_conv_bn = fuse_conv_bn kwargs = { "quantizable_layer_type": quantizable_layer_type, @@ -256,8 +261,13 @@ def forward(self, inputs): """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." + + if self.fuse_conv_bn: + fuse_utils.fuse_conv_bn(model) + self._quantize_inputs.apply(model) self._quantize_outputs.apply(model) + return model def save_quantized_model(self, layer, path, input_spec=None, **config): self._quantize_outputs.save_quantized_model(layer, path, input_spec, diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index a4c7a2a2bf8df..d4c34efb7b900 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -126,6 +126,7 @@ def __init__(self, onnx_format=False, optimize_model=False, is_use_cache_file=False, + skip_tensor_list=None, cache_dir=None): ''' Constructor. 
@@ -198,6 +199,7 @@ def __init__(self, the model accuracy is usually higher when it is 'channel_wise_abs_max'. onnx_format(bool): Whether to export the quantized model with format of ONNX. Default is False. + skip_tensor_list(list): List of tensor names to skip quantization. optimize_model(bool, optional): If set optimize_model as True, it applies some passes to the model before quantization, and it supports `conv2d/depthwise_conv2d + bn` pass so far. Some targets require the @@ -301,6 +303,7 @@ def __init__(self, self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type self._onnx_format = onnx_format + self._skip_tensor_list = skip_tensor_list self._is_full_quantize = is_full_quantize if is_full_quantize: self._quantizable_op_type = self._support_quantize_op_type @@ -547,6 +550,12 @@ def collect_var_name(var_name_list, persistable_var_names, op_type): persistable_var_names = _all_persistable_var_names(self._program) for block_id in range(len(self._program.blocks)): for op in self._program.blocks[block_id].ops: + # skip quantization for tensors in self._skip_tensor_list + if self._skip_tensor_list is not None: + for inp_name in utils._get_op_input_var_names(op): + if inp_name in self._skip_tensor_list: + op._set_attr("op_namescope", "skip_quant") + op_type = op.type if self._is_full_quantize and \ op_type not in self._quantizable_op_type: diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 30e2b4613b185..0140283b915ff 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -354,6 +354,7 @@ set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_qat_fuse PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_qat_user_defined PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 015ecb3d4a4e9..0d035390e2c00 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -56,13 +56,15 @@ def set_vars(self): self.onnx_format = False self.check_export_model_accuracy = True self.diff_threshold = 0.01 + self.fuse_conv_bn = False def func_qat(self): self.set_vars() imperative_qat = ImperativeQuantAware( weight_quantize_type=self.weight_quantize_type, - activation_quantize_type=self.activation_quantize_type) + activation_quantize_type=self.activation_quantize_type, + fuse_conv_bn=self.fuse_conv_bn) with fluid.dygraph.guard(): # For CI coverage @@ -214,6 +216,7 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = True self.diff_threshold = 0.025 + self.fuse_conv_bn = False if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index ff40b170345a8..94e0681d1f57e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++
b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -43,6 +43,7 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.diff_threshold = 0.01 self.onnx_format = False + self.fuse_conv_bn = False print('weight_quantize_type', self.weight_quantize_type) @@ -52,6 +53,7 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = True self.diff_threshold = 0.025 + self.fuse_conv_bn = False print('weight_quantize_type', self.weight_quantize_type) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py new file mode 100644 index 0000000000000..d580eb7ae7aef --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py @@ -0,0 +1,50 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function + +import os +import numpy as np +import random +import unittest +import logging + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.log_helper import get_logger + +from test_imperative_qat import TestImperativeQat + +paddle.enable_static() + +os.environ["CPU_NUM"] = "1" +if core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +class TestImperativeQatfuseBN(TestImperativeQat): + def set_vars(self): + self.weight_quantize_type = 'abs_max' + self.activation_quantize_type = 'moving_average_abs_max' + self.diff_threshold = 0.01 + self.onnx_format = False + self.fuse_conv_bn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py index 85cabb6b5e9b7..89e0e099f44c2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py @@ -247,21 +247,21 @@ def run_test(self, self.assertLess(delta_value, diff_threshold) -class TestPostTrainingKLForMnist(TestPostTrainingQuantization): - def test_post_training_kl(self): +class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization): + def test_post_training_avg(self): model_name = "nlp_lstm_fp32_model" model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" model_md5 = "519b8eeac756e7b4b7bcb2868e880452" data_name = "quant_lstm_input_data" data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" - algo = "KL" + algo = "avg" round_type = "round" quantizable_op_type = ["mul", "lstm"] is_full_quantize = False is_use_cache_file = False 
is_optimize_model = False - diff_threshold = 0.01 + diff_threshold = 0.02 infer_iterations = 100 quant_iterations = 10 self.run_test(model_name, model_url, model_md5, data_name, data_url, @@ -270,44 +270,21 @@ def test_post_training_kl(self): diff_threshold, infer_iterations, quant_iterations) -class TestPostTrainingKLForMnistAdaround(TestPostTrainingQuantization): - def test_post_training_kl(self): +class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization): + def test_post_training_avg_onnx_format(self): model_name = "nlp_lstm_fp32_model" model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" model_md5 = "519b8eeac756e7b4b7bcb2868e880452" data_name = "quant_lstm_input_data" data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" - algo = "KL" - round_type = "adaround" - quantizable_op_type = ["mul", "lstm"] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = False - diff_threshold = 0.01 - infer_iterations = 100 - quant_iterations = 10 - self.run_test(model_name, model_url, model_md5, data_name, data_url, - data_md5, algo, round_type, quantizable_op_type, - is_full_quantize, is_use_cache_file, is_optimize_model, - diff_threshold, infer_iterations, quant_iterations) - - -class TestPostTrainingKLForMnistONNXFormat(TestPostTrainingQuantization): - def test_post_training_kl_onnx_format(self): - model_name = "nlp_lstm_fp32_model" - model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" - model_md5 = "519b8eeac756e7b4b7bcb2868e880452" - data_name = "quant_lstm_input_data" - data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" - data_md5 = "add84c754e9b792fea1fbd728d134ab7" - algo = "KL" + algo = "avg" round_type = "round" quantizable_op_type = ["mul", "lstm"] is_full_quantize = False is_use_cache_file = False is_optimize_model = False - diff_threshold = 0.01 + diff_threshold = 0.02 infer_iterations = 100 quant_iterations = 10 onnx_format = True diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py index c219d2fbf89a9..4c3a758f0e36d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py @@ -117,7 +117,8 @@ def generate_quantized_model(self, is_optimize_model=False, batch_size=10, batch_nums=10, - onnx_format=False): + onnx_format=False, + skip_tensor_list=None): place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -136,6 +137,7 @@ def generate_quantized_model(self, is_full_quantize=is_full_quantize, optimize_model=is_optimize_model, onnx_format=onnx_format, + skip_tensor_list=skip_tensor_list, is_use_cache_file=is_use_cache_file) ptq.quantize() ptq.save_quantized_model(self.int8_model_path) @@ -154,7 +156,8 @@ def run_test(self, batch_size=10, infer_iterations=10, quant_iterations=5, - onnx_format=False): + onnx_format=False, + skip_tensor_list=None): origin_model_path = self.download_model(data_url, data_md5, model_name) origin_model_path = os.path.join(origin_model_path, model_name) @@ -166,10 +169,10 @@ def run_test(self, print("Start INT8 post training quantization for {0} on {1} images ...". 
format(model_name, quant_iterations * batch_size)) - self.generate_quantized_model(origin_model_path, algo, round_type, - quantizable_op_type, is_full_quantize, - is_use_cache_file, is_optimize_model, - batch_size, quant_iterations, onnx_format) + self.generate_quantized_model( + origin_model_path, algo, round_type, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, batch_size, + quant_iterations, onnx_format, skip_tensor_list) print("Start INT8 inference for {0} on {1} images ...".format( model_name, infer_iterations * batch_size)) @@ -338,6 +341,27 @@ def test_post_training_mse(self): infer_iterations, quant_iterations) +class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization): + def test_post_training_kl(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "KL" + round_type = "adaround" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, round_type, + quantizable_op_type, is_full_quantize, is_use_cache_file, + is_optimize_model, diff_threshold, batch_size, + infer_iterations, quant_iterations) + + class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization): def test_post_training_mse_onnx_format(self): model_name = "mnist_model" @@ -405,5 +429,38 @@ def test_post_training_mse_onnx_format_full_quant(self): onnx_format=onnx_format) +class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization): + def test_post_training_avg_skip_op(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "avg" + round_type = "round" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + skip_tensor_list = ["fc_0.w_0"] + self.run_test( + model_name, + data_url, + data_md5, + algo, + round_type, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size, + infer_iterations, + quant_iterations, + skip_tensor_list=skip_tensor_list) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 498a1ec46cacd..629529ff1b965 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -383,7 +383,7 @@ def test_post_training_hist_mobilenetv1(self): is_full_quantize = False is_use_cache_file = False is_optimize_model = True - diff_threshold = 0.025 + diff_threshold = 0.03 self.run_test(model, algo, round_type, data_urls, data_md5s, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold) @@ -412,123 +412,6 @@ def test_post_training_abs_max_mobilenetv1(self): is_optimize_model, diff_threshold) -class TestPostTrainingAvgAdaRoundForMobilenetv1(TestPostTrainingQuantization): - def 
test_post_training_adaround_mobilenetv1(self): - model = "MobileNet-V1" - algo = "avg" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTrainingAbsMaxAdaRoundForMobilenetv1( - TestPostTrainingQuantization): - def test_post_training_adaround_mobilenetv1(self): - model = "MobileNet-V1" - algo = "abs_max" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTraininghistAdaroundForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_hist_mobilenetv1(self): - model = "MobileNet-V1" - algo = "hist" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTrainingKLAdaroundForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_kl_mobilenetv1(self): - model = "MobileNet-V1" - algo = "KL" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - "pool2d", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_avg_mobilenetv1(self): - model = "MobileNet-V1" - algo = "emd" - round_type = "round" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization): def test_post_training_onnx_format_mobilenetv1(self): model = "MobileNet-V1" diff 
--git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 304f31c2b1629..6dc3813fa6d0c 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -22,7 +22,7 @@ from .. import core from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher from ..multiprocess_utils import _cleanup_mmap, CleanupFuncRegistrar, MP_STATUS_CHECK_INTERVAL -from ..framework import _non_static_mode +from ..framework import _non_static_mode, _in_eager_without_dygraph_check from .flat import _flatten_batch # NOTE: queue has a different name in python2 and python3 @@ -339,10 +339,16 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, out_queue.put((idx, batch, None)) batch, structure = _flatten_batch(batch) if use_shared_memory: + # NOTE: In eager mode, Tensor._share_memory has no + # effect, fall back to _array_to_share_memory_tensor + def tensor_share_memory(tensor): + if _in_eager_without_dygraph_check(): + return core._array_to_share_memory_tensor(tensor) + return tensor._share_memory() tensor_list = [ core._array_to_share_memory_tensor(b) - if isinstance(b, np.ndarray) else b._share_memory() - for b in batch + if isinstance(b, np.ndarray) \ + else tensor_share_memory(b) for b in batch ] out_queue.put((idx, tensor_list, structure)) core._remove_tensor_list_mmap_fds(tensor_list) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index f7d4be7ee6e3c..5da5dbbd7bdfc 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -276,9 +276,10 @@ def amp_guard(enable=True, if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() or - tracer._expected_place.is_npu_place()): + tracer._expected_place.is_npu_place() or + tracer._expected_place.is_custom_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False # For npu: @@ -293,6 +294,10 @@ def amp_guard(enable=True, if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): warnings.warn('MLUPlace only support float16 amp.') enable = False + # For custom device: + if tracer._expected_place.is_custom_place() and (dtype == 'bfloat16'): + warnings.warn('CustomPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index c57290861942b..df79b5ab5e482 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -107,9 +107,10 @@ def __init__(self, if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() or - tracer._expected_place.is_npu_place()): + tracer._expected_place.is_npu_place() or + tracer._expected_place.is_custom_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' 
+ 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index 0670c048c5e26..60043c42121bd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -21,6 +21,10 @@ from paddle.fluid import core from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap from paddle.fluid.framework import Program +try: + from collections.abc import Sequence +except: + from collections import Sequence # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. ORIGI_INFO = "Original information of source code for ast node." @@ -214,7 +218,7 @@ def ast_walk(transformed_node, static_node): def _as_list(x): if x is None: return [] - return list(x) if isinstance(x, collections.Sequence) else [x] + return list(x) if isinstance(x, Sequence) else [x] transformed_node_list = _as_list(transformed_node) static_node_list = _as_list(static_node) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index bc1a0e30dd42d..b860740f71b25 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -196,10 +196,11 @@ def from_func_and_args(cls, function_spec, args, kwargs, class_instance): def __hash__(self): error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)." + with_hook = self.kwargs.get("with_hook", False) return hash((id(self.function_spec), make_hashable(self.input_args_with_spec, error_msg), make_hashable(self.input_kwargs_with_spec, error_msg), - self._spec_names_id, self.class_instance)) + self._spec_names_id, self.class_instance, with_hook)) def __eq__(self, other): return (type(self) is type(other)) and hash(self) == hash(other) @@ -413,6 +414,8 @@ def get_concrete_program(self, *args, **kwargs): Traced ConcreteProgram and executable translated Layer. """ + with_hook = kwargs.get("with_hook", False) + if "with_hook" in kwargs: kwargs.pop("with_hook") # 1. unify args/kwargs and replace Tensor with InputSpec if len(args) != len(self._function_spec.args_name): args, kwargs = self._function_spec.unified_args_and_kwargs(args, @@ -421,9 +424,13 @@ def get_concrete_program(self, *args, **kwargs): args, kwargs) # 2. generate cache key - cache_key = CacheKey(self._function_spec, input_args_with_spec, - input_kwargs_with_spec, self._class_instance, - **self._kwargs) + cache_key = CacheKey( + self._function_spec, + input_args_with_spec, + input_kwargs_with_spec, + self._class_instance, + **self._kwargs, + with_hook=with_hook) # 3. check whether hit the cache or build a new program for the input arguments concrete_program, partial_program_layer = self._program_cache[cache_key] @@ -480,11 +487,13 @@ def foo(x, y): """ return self.concrete_program_specify_input_spec(input_spec=None) - def concrete_program_specify_input_spec(self, input_spec=None): + def concrete_program_specify_input_spec(self, + input_spec=None, + with_hook=False): """ Returns recent ConcreteProgram instance of decorated function while specifying input_spec. 
If the self._function_spec already has - input_spce, it will check the compatibility of input input_spec and + input_spec, it will check the compatibility of input input_spec and the self._function_spec.input_spec. If input input_spec=None, then this method uses self._function_spec.input_spec @@ -516,12 +525,18 @@ def concrete_program_specify_input_spec(self, input_spec=None): has_input_spec = (desired_input_spec is not None) if has_input_spec: concrete_program, _ = self.get_concrete_program( - *desired_input_spec) + *desired_input_spec, with_hook=with_hook) return concrete_program else: raise ValueError( "No valid transformed program for {}.\n\t Please specific `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n". format(self._function_spec)) + elif with_hook: + cache_key = self._program_cache._recent_cache_key + cache_key.kwargs["with_hook"] = True + concrete_program, _ = self._program_cache[cache_key] + return concrete_program + # If more than one programs have been cached, return the recent converted program by default. elif cached_program_len > 1: logging_utils.warn( @@ -588,6 +603,54 @@ def _verify_init_in_dynamic_mode(class_instance): class_instance)) +class HookHelper(object): + """ + Only For converting pre/post hooks operation in outermost layer while jit.save. + Because hooks in sublayer have been processed automatically. + """ + + def __init__(self, func, class_instance, with_hook=False): + self.func = func + self.class_instance = class_instance + self.with_hook = with_hook + self.need_apply_hook = with_hook and isinstance( + self.class_instance, + layers.Layer) and getattr(func, "__name__") == "forward" + + def apply_pre_hooks(self, inputs): + """ + Apply _forward_pre_hooks from outermost layer + """ + if not self.need_apply_hook: return inputs + + inputs = inputs[1:] + for forward_pre_hook in self.class_instance._forward_pre_hooks.values(): + hook_result = forward_pre_hook(self.class_instance, inputs) + if hook_result is not None: + if not isinstance(hook_result, tuple): + hook_result = (hook_result, ) + inputs = hook_result + + return [self.class_instance] + list(inputs) + + def apply_post_hooks(self, inputs, outputs): + """ + Apply _forward_post_hooks from outermost layer + """ + if not self.need_apply_hook: return outputs + + inputs = inputs[1:] + for forward_post_hook in self.class_instance._forward_post_hooks.values( + ): + hook_result = forward_post_hook(self.class_instance, inputs, + outputs) + if hook_result is not None: + outputs = hook_result + + inputs.insert(0, self.class_instance) + return outputs + + class ConcreteProgram(object): __slots__ = [ @@ -629,6 +692,9 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, # Transforms dygraph function into static function and caches it. dygraph_function = func_spec.dygraph_function static_func = convert_to_static(dygraph_function) + # apply pre\post hook for outermost layer + hook_helper = HookHelper(dygraph_function, class_instance, + kwargs.get("with_hook", False)) main_program, startup_program = framework.Program(), framework.Program() # Note: The random seed should be synchronized into cached program @@ -642,12 +708,13 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, with framework.program_guard(main_program, startup_program): with _switch_declarative_mode_guard_(is_declarative=True): # 1. 
Adds `fluid.data` layers for input if needed - inputs = func_spec.to_static_inputs_with_spec(input_spec, - main_program) + static_inputs = func_spec.to_static_inputs_with_spec( + input_spec, main_program) _kwargs = func_spec.to_static_inputs_with_spec( input_kwargs_spec, main_program) if class_instance: - inputs = tuple([class_instance] + list(inputs)) + static_inputs = tuple([class_instance] + list( + static_inputs)) # 2. Gets all ParamBases and buffered VarBases in the function all_parameters_and_buffers = _extract_indeed_params_buffers( @@ -658,10 +725,13 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, class_instance, False)), param_guard( get_buffers(class_instance, False)): try: + # Only needed for jit.save; this is a no-op during the train and eval process + inputs = hook_helper.apply_pre_hooks(static_inputs) if _kwargs: outputs = static_func(*inputs, **_kwargs) else: outputs = static_func(*inputs) + outputs = hook_helper.apply_post_hooks(inputs, outputs) except BaseException as e: # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. error.attach_error_data(e) @@ -679,7 +749,7 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, main_program = update_op_callstack_with_origin_info(main_program) return ConcreteProgram( - inputs=inputs, + inputs=static_inputs, outputs=outputs, parameters=all_parameters_and_buffers, function=dygraph_function, @@ -709,6 +779,7 @@ def __init__(self): self._caches = collections.OrderedDict() # trace mostly recent used program self._recent_key = None + self._recent_cache_key = None def _build_once(self, cache_key): concrete_program = ConcreteProgram.from_func_spec( @@ -724,6 +795,7 @@ def __getitem__(self, item): raise ValueError('type(item) should be CacheKey, but received %s' % type_name(item)) item_id = hash(item) + self._recent_cache_key = item self._recent_key = item_id if item_id not in self._caches: self._caches[item_id] = self._build_once(item) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 7957b33bf1dce..e0e259215c509 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -302,6 +302,7 @@ def __init__(self): # If True, It will save inference program only, and do not save params of Program self._program_only = False + self.with_hook = False @property def output_spec(self): @@ -370,7 +371,7 @@ def keep_name_table(self, value): def _parse_save_configs(configs): - supported_configs = ['output_spec'] + supported_configs = ['output_spec', "with_hook"] # input check for key in configs: @@ -382,6 +383,7 @@ def _parse_save_configs(configs): # construct inner config inner_config = _SaveLoadConfig() inner_config.output_spec = configs.get('output_spec', None) + inner_config.with_hook = configs.get('with_hook', False) return inner_config @@ -454,11 +456,15 @@ def _get_input_var_names(inputs, input_spec): return result_list -def _get_output_vars(outputs, output_spec): +def _get_output_vars(outputs, output_spec, with_hook=False): name_no_exists_error = "The tensor `%s` does not exists. " \ "Please make sure the name of example Tensor " \ "in configs.output_spec is the output tensor of " \ "Layer.forward method." + if output_spec and with_hook: + raise RuntimeError( + "Specifying output_spec is currently not supported when pre/post hooks are found in your outermost layer."
+ ) result_list = [] output_vars_dict = OrderedDict() for var in flatten(outputs): @@ -830,10 +836,16 @@ def fun(inputs): # parse configs configs = _parse_save_configs(configs) + # whether outermost layer has pre/post hook, if does, we need also save + # these operators in program. + with_hook = configs.with_hook + scope = core.Scope() extra_var_info = dict() if isinstance(layer, Layer): functions = dir(inner_layer) + if inner_layer._forward_pre_hooks or inner_layer._forward_post_hooks: + with_hook = True else: # layer is function functions = [layer, ] @@ -842,7 +854,7 @@ def fun(inputs): static_func = getattr(inner_layer, attr_func, None) if isinstance(static_func, StaticFunction): concrete_program = static_func.concrete_program_specify_input_spec( - inner_input_spec) + inner_input_spec, with_hook=with_hook) elif 'forward' == attr_func: # transform in jit.save, if input_spec is incomplete, declarative will throw error # inner_input_spec is list[InputSpec], it should be packed with same structure @@ -852,7 +864,8 @@ def fun(inputs): inner_input_spec) static_forward = declarative( inner_layer.forward, input_spec=inner_input_spec) - concrete_program = static_forward.concrete_program + concrete_program = static_forward.concrete_program_specify_input_spec( + with_hook=with_hook) # the input_spec has been used in declarative, which is equal to # @declarative with input_spec and jit.save without input_spec, # avoid needless warning @@ -943,8 +956,10 @@ def fun(inputs): # the rule is like [ Get input variables name ]. For output var, # we only support VarBase spec, and actually, we only need the # var name of output, and we don't recommended to use output_spec + # print(concrete_program.main_program) + # print(concrete_program.outputs, configs.output_spec) output_vars = _get_output_vars(concrete_program.outputs, - configs.output_spec) + configs.output_spec, with_hook) # 5. 
save inference model from paddle.fluid.io import save_inference_model diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 56b743f4463ae..c6ff3a583d6a3 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1386,7 +1386,8 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, def _can_use_interpreter_core(program, place): if core.is_compiled_with_npu() or core.is_compiled_with_xpu( - ) or core.is_compiled_with_mlu() or core.is_compiled_with_ipu(): + ) or core.is_compiled_with_mlu() or core.is_compiled_with_ipu( + ) or isinstance(place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 817e742fd1d8a..16a5e25472557 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2863,8 +2863,22 @@ def _to_readable_code(self, skip_op_callstack=True): attrs_str += ", " continue + # it is bytes of serialized protobuf + if self.type == 'cinn_launch' and name == 'compilation_key': + # value = core.get_readable_comile_key(self.desc) + v = self.desc.attr(name) + prog = Program() + prog = prog.parse_from_string(v) + s = prog._to_readable_code() + lines = s.split('\n') + value = '\n'.join([' ' + line for line in lines]) + value = '\n' + value + else: + value = self.desc.attr(name) + a = "{name} = {value}".format( - name=name, type=attr_type, value=self.desc.attr(name)) + name=name, type=attr_type, value=value) + attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 8d803c0d5bd7d..40ff41fe89f47 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -1139,10 +1139,11 @@ def minimize(self, from paddle.fluid.transpiler.collective import MultiThread # check start program if program_mode not in [ - "all_reduce", "fuse_all_reduce", "all_gather" + "all_reduce", "fuse_all_reduce", "all_gather", + "all_reduce_xpu" ]: raise ValueError("You should set program_mode in [ all_reduce, \ - fuse_all_reduce, all_gather ]") + fuse_all_reduce, all_gather, all_reduce_xpu ]") env = self.get_dist_env() if not isinstance(losses, list): startup_programs = [startup_programs] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8b10a5f454e69..200e8feec1e6a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6781,7 +6781,10 @@ def lod_append(x, level): x = fluid.layers.data(name='x', shape=[6, 10], lod_level=1) out = fluid.layers.lod_append(x, [1,1,1,1,1,1]) """ - from collections import Iterable + try: + from collections.abc import Iterable + except: + from collections import Iterable if x is None: raise ValueError("Input(x) can't be None.") if (not isinstance(level, Iterable)) and (not isinstance(level, Variable)): diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 1b9c87f1c0d06..707a1dc2cbc2f 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -33,6 +33,10 @@ from ..framework import _non_static_mode from ..param_attr import ParamAttr from ..data_feeder import check_variable_and_dtype, check_type, check_dtype +try: + from collections.abc import Sequence +except: + from collections import 
Sequence __all__ = [ 'RNNCell', @@ -163,7 +167,7 @@ def _is_shape_sequence(seq): # TODO: Add check for the illegal if isinstance(seq, dict): return True - return (isinstance(seq, collections.Sequence) and + return (isinstance(seq, Sequence) and not isinstance(seq, six.string_types)) class Shape(object): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 693fbf20e64a8..a9b1fa6ff0205 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -1470,6 +1470,11 @@ def range(start, end, step, dtype, name=None): # [3, 4, 5, 6] """ + out_shape = None + if not isinstance(start, Variable) and not isinstance( + end, Variable) and not isinstance(step, Variable): + out_shape = [int(math.ceil((end - start) / step))] + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -1500,11 +1505,6 @@ def range(start, end, step, dtype, name=None): out.stop_gradient = True return out - out_shape = None - if not isinstance(start, Variable) and not isinstance( - end, Variable) and not isinstance(step, Variable): - out_shape = [int(math.ceil((end - start) / step))] - check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'range/arange') helper = LayerHelper('range', **locals()) @@ -1516,6 +1516,8 @@ def range(start, end, step, dtype, name=None): 'Step': step}, outputs={'Out': out}) out.stop_gradient = True + if out_shape is not None: + out.desc.set_shape(out_shape) return out diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index c30f41f6a20d9..5d781a437fe8f 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -21,6 +21,10 @@ from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..layer_helper import LayerHelper from sys import version_info +try: + from collections.abc import Sequence +except: + from collections import Sequence def convert_to_list(value, n, name, dtype=int): @@ -74,8 +78,7 @@ def is_sequence(seq): """ if isinstance(seq, dict): return True - return (isinstance(seq, collections.Sequence) and - not isinstance(seq, six.string_types)) + return (isinstance(seq, Sequence) and not isinstance(seq, six.string_types)) def _hash_with_id(*args): @@ -148,7 +151,7 @@ def _sequence_like(instance, args): return type(instance)((key, result[key]) for key in six.iterkeys(instance)) elif (isinstance(instance, tuple) and hasattr(instance, "_fields") and - isinstance(instance._fields, collections.Sequence) and + isinstance(instance._fields, Sequence) and all(isinstance(f, six.string_types) for f in instance._fields)): # This is a namedtuple return type(instance)(*args) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 841c58821d7a1..3ea3af9ed1cb5 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -19,6 +19,7 @@ import threading import paddle import time +import copy from .framework import Program, Variable, program_guard, default_main_program, default_startup_program, _non_static_mode, cpu_places, _current_expected_place, _in_eager_without_dygraph_check from .executor import global_scope @@ -214,7 +215,7 @@ def get_sub_dataset(self, dataset, batch_size): return sub_dataset def get_autotune_loader(self): - loader = self.loader + loader = copy.copy(self.loader) batch_size = self.loader.batch_sampler.batch_size if isinstance(self.loader.batch_sampler, paddle.io.DistributedBatchSampler): diff --git 
a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 32d8f5e3847c8..08e24f86a29a4 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -25,6 +25,7 @@ list(APPEND DIST_TEST_OPS test_ir_pass_pipeline) list(APPEND DIST_TEST_OPS test_static_model_parallel) list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_feedforward) list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_attention) +list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_multi_transformer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) @@ -128,6 +129,7 @@ if(NOT WITH_GPU) LIST(REMOVE_ITEM TEST_OPS test_fused_feedforward_op) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) + LIST(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op) LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) endif() @@ -912,6 +914,7 @@ set_tests_properties(test_parallel_executor_crf test_sync_batch_norm_op test_inp test_parallel_executor_seresnext_with_fuse_all_reduce_gpu test_distributed_fused_lamb_op_with_clip test_distributed_fused_lamb_op_without_clip + test_distributed_fused_lamb_op_with_gradient_merge test_parallel_executor_fetch_isolated_var PROPERTIES LABELS "RUN_TYPE=DIST") @@ -1045,6 +1048,7 @@ set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) +set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) @@ -1065,6 +1069,7 @@ set_tests_properties(test_lstm_cudnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_stack_op PROPERTIES TIMEOUT 120) set_tests_properties(test_bilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) +set_tests_properties(test_einsum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) set_tests_properties(test_deformable_psroi_pooling PROPERTIES TIMEOUT 120) set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) @@ -1186,6 +1191,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240) set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120) set_tests_properties(test_static_model_parallel_fused_attention PROPERTIES TIMEOUT 120) + set_tests_properties(test_static_model_parallel_fused_multi_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_split_embedding test_collective_split_embedding_none_divisible test_collective_split_row_linear @@ -1229,6 +1235,7 @@ set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") 
+set_tests_properties(test_tensordot PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=False") set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 97a3092f11fd2..4d052f7e90cd3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -12,6 +12,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_high_order_grad MODULES test_high_order_grad ENVS ${dist_ENVS}) + set_tests_properties(test_high_order_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index d7321066ed9d9..b039bb76dcb03 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -127,9 +127,16 @@ def train(): engine.prepare(optimizer, loss) engine.fit(dataset, batch_size=batch_size, - steps_per_epoch=batch_num * batch_size) - engine.save('./mlp') - engine.load('./mlp') + steps_per_epoch=batch_num * batch_size, + sample_generator=True) + + eval_dataset = MyDataset(batch_size) + engine.prepare(optimizer, loss, mode='eval') + engine.evaluate(eval_dataset, batch_size) + + test_dataset = MyDataset(batch_size) + engine.prepare(mode='predict') + engine.predict(test_dataset, batch_size) engine.save('./mlp_inf', training=False, mode='predict') diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py deleted file mode 100644 index 5f7c018ee4f16..0000000000000 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -import time -import paddle.fluid as fluid -import copy -import os -import numpy as np -import subprocess -import paddle -import paddle.nn as nn -import paddle.fluid as fluid -import paddle.static as static -import paddle.nn.functional as F -import paddle.utils as utils -from paddle.fluid import layers -from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine - -paddle.enable_static() -global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) -batch_size = 1 -batch_num = 10 -hidden_size = 1024 -image_size = hidden_size - -paddle.seed(44) - - -class MyDataset(Dataset): - def __init__(self, num_samples): - super(MyDataset, self).__init__() - self.num_samples = num_samples - - def __getitem__(self, index): - input = np.random.uniform(size=image_size).astype("float32") - return input - - def __len__(self): - return self.num_samples - - -class MLPLayer(nn.Layer): - def __init__(self, - hidden_size=1024, - intermediate_size=4 * 1024, - dropout_ratio=0.1, - initializer_range=0.02): - super(MLPLayer, self).__init__() - d_model = hidden_size - dim_feedforward = intermediate_size - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range)) - bias_attr = None - - self.linear0 = nn.Linear( - d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) - self.linear1 = nn.Linear( - dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) - self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) - self.norm = nn.LayerNorm(d_model, epsilon=1e-5) - self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") - - def forward(self, input): - out = self.norm(input) - out = self.linear0(input) - auto.shard_tensor( - self.linear0.weight, - dist_attr={ - "process_mesh": global_process_mesh, - "dims_mapping": [-1, 0] - }) - out = F.gelu(out, approximate=True) - out = self.linear1(out) - auto.shard_tensor( - self.linear1.weight, - dist_attr={ - "process_mesh": global_process_mesh, - "dims_mapping": [0, -1] - }) - out = self.dropout(out) - out = self.linear2(out) - return out - - -def train(): - mlp = MLPLayer( - hidden_size=hidden_size, - intermediate_size=4 * hidden_size, - dropout_ratio=0.1, - initializer_range=0.02) - - dataset = MyDataset(batch_num * batch_size) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - - dist_strategy = fleet.DistributedStrategy() - # init parallel optimizer - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) - - engine = Engine(mlp, inputs_spec=inputs_spec, strategy=dist_strategy) - engine.prepare(mode='predict') - engine.predict(dataset, batch_size=batch_size) - - -if __name__ == "__main__": - train() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py new file mode 100644 index 0000000000000..9a9efe7ab2dd0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import paddle +import unittest +import numpy as np +import paddle.distributed.auto_parallel as auto + +from paddle.static import InputSpec +from paddle.distributed import fleet +from paddle.incubate.autograd import Hessian +from paddle.distributed.auto_parallel.engine import Engine + + +class FCNet: + def __init__(self, num_ins, num_outs, num_layers, hidden_size): + self.num_ins = num_ins + self.num_outs = num_outs + self.num_layers = num_layers + self.hidden_size = hidden_size + self.activation = paddle.tanh + + self.weights = [] + self.biases = [] + for i in range(self.num_layers): + if i == 0: + lsize = self.num_ins + rsize = self.hidden_size + elif i == (self.num_layers - 1): + lsize = self.hidden_size + rsize = self.num_outs + else: + lsize = self.hidden_size + rsize = self.hidden_size + + w = paddle.static.create_parameter( + shape=[lsize, rsize], dtype="float32", is_bias=False) + b = paddle.static.create_parameter( + shape=[rsize], dtype="float32", is_bias=True) + self.weights.append(w) + self.biases.append(b) + + def nn_func(self, ins): + u = ins + for i in range(self.num_layers - 1): + u = paddle.nn.functional.linear(u, self.weights[i], self.biases[i]) + u = self.activation(u) + u = paddle.nn.functional.linear(u, self.weights[-1], self.biases[-1]) + return u + + +class LaplaceModel(paddle.nn.Layer): + def __init__(self, num_ins=2, num_outs=1, num_layers=5, hidden_size=20): + super(LaplaceModel, self).__init__() + self.net = FCNet( + num_ins=num_ins, + num_outs=num_outs, + num_layers=num_layers, + hidden_size=hidden_size) + + def forward(self, inputs, bc_index): + inputs.stop_gradient = False + outputs = self.net.nn_func(inputs) + # eq_loss + hes = Hessian(self.net.nn_func, inputs, is_batched=True) + eq_loss = paddle.norm(hes[:, 0, 0] + hes[:, 1, 1], p=2) + # bc_loss + bc_u = paddle.index_select(outputs, bc_index) + return eq_loss, bc_u + + +class LaplaceDataset: + def __init__(self, num_sample): + self.num_sample = num_sample + + def __getitem__(self, index): + x = np.linspace(0, 0.9, 10) + y = np.linspace(0, 0.9, 10) + bc_value = np.random.rand(36).reshape(36, 1).astype('float32') + + domain_space = [] + bc_index = [] + for j in range(len(y)): + for i in range(len(x)): + domain_space.append([x[i], y[j]]) + if i == 0 or i == 9 or j == 0 or j == 9: + bc_index.append(i + 10 * j) + domain_space = np.array(domain_space, dtype='float32') + bc_index = np.array(bc_index, dtype='int64') + + return domain_space, bc_index, bc_value + + def __len__(self): + return self.num_sample + + +def loss_func(eq_loss, bc_u, bc_value): + bc_diff = bc_u - bc_value + bc_loss = paddle.norm(bc_diff, p=2) + loss = eq_loss + bc_loss + return loss + + +def main(): + # dataset + train_dataset = LaplaceDataset(10) + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=0.001) + # model + laplace = LaplaceModel() + + # spec + inputs_spec = [ + InputSpec([100, 2], 'float32', 'x'), InputSpec([36], 'int64', 'bc_idx') + ] + labels_spec = InputSpec([36, 1], 'float32', 'bc_v') + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + 
fleet.init(is_collective=True, strategy=dist_strategy) + + engine = Engine( + laplace, + inputs_spec=inputs_spec, + labels_spec=labels_spec, + strategy=dist_strategy) + paddle.seed(1234 + engine._cur_rank) + engine.prepare(optimizer=optimizer, loss=loss_func) + res = engine.fit(train_dataset, sample_generator=False) + assert np.allclose(res[-1], 2.840593) + + dist_context = engine.dist_context + block = engine.main_program.global_block() + ops = block.ops + for op in ops: + if op.type == 'p_norm': + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr.impl_type == 'p_norm' + if 'x' in op.input_arg_names: + out_name = op.output_arg_names[0] + assert block.vars[out_name].shape[0] == 50 + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index 5ca12bc1e0e17..efcad7eb11268 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -49,28 +49,6 @@ def test_engine_api(self): if os.path.exists('rank_mapping.csv'): os.remove('rank_mapping.csv') - def test_engine_predict(self): - file_dir = os.path.dirname(os.path.abspath(__file__)) - launch_model_path = os.path.join(file_dir, "engine_predict_api.py") - - if os.environ.get("WITH_COVERAGE", "OFF") == "ON": - coverage_args = ["-m", "coverage", "run", "--branch", "-p"] - else: - coverage_args = [] - - cmd = [sys.executable, "-u"] + coverage_args + [ - "-m", "launch", "--gpus", "0,1", launch_model_path - ] - - process = subprocess.Popen(cmd) - process.wait() - self.assertEqual(process.returncode, 0) - - # Remove unnecessary files - log_path = os.path.join(file_dir, "log") - if os.path.exists(log_path): - shutil.rmtree(log_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py new file mode 100644 index 0000000000000..ab4a34cf99cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestHighOrderGrad(unittest.TestCase): + def test_dp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "high_order_grad.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--gpus", "0,1", launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + # Remove unnecessary files + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index 00d2a1f71d6bd..0af7d40a2f02e 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -149,6 +149,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): kwargs['exclude_from_weight_decay_fn'] = exclude_fn kwargs['lamb_weight_decay'] = 0.1 + gm_steps = kwargs['gradient_accumulation_steps'] if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) @@ -163,6 +164,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): ) kwargs['grad_clip'] = GradClipDecorator(base_clip, clip_after_allreduce) + kwargs.pop('gradient_accumulation_steps', None) optimizer = optimizer_class(**kwargs) get_parameter = optimizer._get_parameter @@ -173,6 +175,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): if use_fp16: if not use_distributed_lamb: optimizer._multi_precision = True + optimizer = paddle.static.amp.decorate( optimizer, amp_list, @@ -180,6 +183,13 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): use_dynamic_loss_scaling=False, use_pure_fp16=use_fp16, use_fp16_guard=use_fp16) + amp_init = optimizer.amp_init + else: + amp_init = None + + if gm_steps > 1 and not use_distributed_lamb: + optimizer = paddle.fluid.optimizer.GradientMergeOptimizer( + optimizer, k_steps=gm_steps, avg=False) params_grads = optimizer.backward(loss, startup) op_num = len(main.global_block().ops) @@ -211,7 +221,7 @@ def gen_random_grad_tensor(grad): return grad_t def reader(): - for _ in range(5): + for _ in range(6): yield dict( [(grad.name, gen_random_grad_tensor(grad)) for grad in grads]) @@ -223,8 +233,8 @@ def reader(): place = paddle.CUDAPlace(dev_id) exe = paddle.static.Executor(place) exe.run(startup) - if use_fp16: - optimizer.amp_init(place) + if amp_init is not None: + amp_init(place) master_p_ts = [] for p in params: @@ -258,10 +268,12 @@ def config(self): distutils.util.strtobool( os.getenv('CLIP_AFTER_ALLREDUCE', 'True'))) max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0)) + gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1)) print('clip_after_allreduce = {}, max_global_norm = {}'.format( clip_after_allreduce, max_global_norm)) return { 'clip_after_allreduce': clip_after_allreduce, + 'gradient_accumulation_steps': gm_steps, 'grad_clip': paddle.nn.ClipGradByGlobalNorm(max_global_norm) if max_global_norm > 0 else None, } diff --git 
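The fused-LAMB test base above threads a GRADIENT_MERGE_STEPS environment variable into the optimizer config: DistributedFusedLamb consumes `gradient_accumulation_steps` directly, while the plain baseline pops that key and instead wraps the optimizer in `GradientMergeOptimizer`, which applies a parameter update only every `k_steps` micro-batches. A minimal static-graph sketch of that wrapper (the toy network and the step count of 4 are illustrative, not part of the test):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data('x', [None, 8], 'float32')
        y = paddle.static.data('y', [None, 1], 'float32')
        pred = paddle.static.nn.fc(x, size=1)
        loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))
        opt = fluid.optimizer.SGD(learning_rate=0.01)
        # accumulate gradients over 4 micro-batches, then apply one update;
        # avg=False sums the merged gradients instead of averaging them
        opt = fluid.optimizer.GradientMergeOptimizer(opt, k_steps=4, avg=False)
        opt.minimize(loss)
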
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py new file mode 100644 index 0000000000000..dcb41cfc6aba7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle + +import numpy as np + + +def forward_post_hook1(layer, input, output): + return output + output + + +def forward_pre_hook1(layer, input): + input_return = (input[0] * 2, ) + return input_return + + +class SimpleNet(paddle.nn.Layer): + def __init__(self, ): + super(SimpleNet, self).__init__() + self.fc1 = paddle.nn.Linear(10, 10) + # sublayer1 register post hook + self.fc1.register_forward_post_hook(forward_post_hook1) + + self.fc2 = paddle.nn.Linear(10, 10) + # sublayer2 register pre hook + self.fc2.register_forward_pre_hook(forward_pre_hook1) + + # register pre/post hook + self.register_forward_pre_hook(forward_pre_hook1) + self.register_forward_post_hook(forward_post_hook1) + + def forward(self, x): + x = self.fc1(x) + x = self.fc2(x) + out = paddle.mean(x) + + return out + + +class TestNestLayerHook(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.x = paddle.randn([4, 10]) + self.path = "./net_hook" + + def train_net(self, to_static=False): + paddle.seed(2022) + net = SimpleNet() + if to_static: + net = paddle.jit.to_static(net) + out = net(self.x) + + if to_static: + paddle.jit.save(net, self.path) + + return out.numpy()[0] + + def load_train(self): + net = paddle.jit.load(self.path) + out = net(self.x) + return out.numpy()[0] + + def test_hook(self): + dy_out = self.train_net(to_static=False) + st_out = self.train_net(to_static=True) + load_out = self.load_train() + print(st_out, dy_out, load_out) + self.assertTrue( + np.allclose(st_out, dy_out), + msg='dygraph_res is {}\nstatic_res is {}'.format(dy_out, st_out)) + self.assertTrue( + np.allclose(st_out, load_out), + msg='load_out is {}\nstatic_res is {}'.format(load_out, st_out)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py index ab836b088b09f..872d419ff8928 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py @@ -20,9 +20,6 @@ from simnet_dygraph_model_v2 import BOW, HingeLoss -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - SEED = 102 random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 569d994b831b6..defbffe8f2020 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -27,6 +27,10 @@ 
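The new test_layer_hook.py above exercises forward pre- and post-hooks both in dygraph and through `paddle.jit.to_static`/`paddle.jit.save`. The hook contract itself is small: a pre-hook receives `(layer, inputs)` and may return a replacement input tuple, and a post-hook receives `(layer, inputs, output)` and may return a replacement output. A minimal dygraph sketch (layer sizes are arbitrary):

    import paddle

    def double_input(layer, inputs):
        # pre-hook: return a tuple that replaces the original inputs
        return (inputs[0] * 2, )

    def double_output(layer, inputs, output):
        # post-hook: rescale whatever the layer produced
        return output + output

    linear = paddle.nn.Linear(10, 10)
    linear.register_forward_pre_hook(double_input)
    linear.register_forward_post_hook(double_output)

    x = paddle.randn([4, 10])
    y = linear(x)   # pre-hook -> Linear forward -> post-hook
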
from paddle.fluid.executor import Executor from paddle.fluid.backward import _append_grad_suffix_, _as_list from paddle.fluid.framework import _test_eager_guard +try: + from collections.abc import Sequence +except: + from collections import Sequence def _product(t): @@ -91,7 +95,7 @@ def var_to_np_array_in_scope(scope, place, name): def make_jacobian(x, y_size, np_dtype): if isinstance(x, fluid.framework.Variable): return np.zeros((_product(x.shape), y_size), dtype=np_dtype) - elif isinstance(x, collections.Sequence): + elif isinstance(x, Sequence): jacobians = list( filter(lambda t: t is not None, (make_jacobian( item, y_size, np_dtype) for item in x))) @@ -556,7 +560,10 @@ def get_static_double_grad(x, # so, they are also the input of second-order backward. x += y_grads x_init += dy_init - y = dx + + # filter None in dx for DX/DY may be None in kernel + filted_dx = [dxi for dxi in dx if dxi is not None] + y = filted_dx # check input arguments x = _as_list(x) @@ -615,6 +622,7 @@ def get_static_double_grad(x, def get_eager_double_grad(func, x_init=None, dy_init=None, + place=None, return_mid_result=False): """ Get Double Grad result of dygraph. @@ -623,6 +631,7 @@ def get_eager_double_grad(func, func: A wrapped dygraph function that its logic is equal to static program x_init (numpy.array|list[numpy.array]|None): the init value for input x. dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. return_mid_result (bool): A flag that controls the return content. Returns: If 'return_mid_result' set True. @@ -631,6 +640,10 @@ def get_eager_double_grad(func, If 'return_mid_result' set False. A list of numpy array that stores second derivative result calulated by dygraph. """ + if isinstance(place, fluid.CPUPlace): + paddle.set_device("cpu") + if isinstance(place, fluid.CUDAPlace): + paddle.set_device("gpu") inputs = [] dys = [] for x in x_init: @@ -644,7 +657,12 @@ def get_eager_double_grad(func, # calculate first derivative outputs = func(inputs) d_inputs = paddle.grad( - outputs=outputs, inputs=inputs, grad_outputs=dys, create_graph=True) + outputs=outputs, + inputs=inputs, + grad_outputs=dys, + create_graph=True, + allow_unused=True) + d_inputs = [d_input for d_input in d_inputs if d_input is not None] # calcluate second derivative inputs = inputs + dys @@ -659,15 +677,20 @@ def get_eager_double_grad(func, ddy = paddle.ones(shape=d_input.shape, dtype=d_input.dtype) ddy.stop_gradient = False ddys.append(ddy) + dd_inputs = paddle.grad( outputs=d_inputs, inputs=inputs, grad_outputs=ddys, - create_graph=create_graph) + create_graph=create_graph, + allow_unused=True) + if return_mid_result: return dd_inputs, inputs + ddys else: - return [dd_input.numpy() for dd_input in dd_inputs] + return [ + dd_input.numpy() for dd_input in dd_inputs if dd_input is not None + ] def double_grad_check_for_dygraph(func, @@ -689,7 +712,6 @@ def double_grad_check_for_dygraph(func, y (Variable|list[Variable]): output variables to the program. x_init (numpy.array|list[numpy.array]|None): the init value for input x. place (fluid.CPUPlace or fluid.CUDAPlace): the device. - eps (float): perturbation for finite differences. atol (float): absolute tolerance. rtol (float): relative tolerance. 
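`get_eager_double_grad` now calls `paddle.grad` with `allow_unused=True` and drops the resulting `None` entries, mirroring the `filted_dx` filtering on the static-graph side, because a double-grad kernel may legitimately produce no gradient for some inputs. A minimal dygraph illustration of that behavior (tensor shapes are arbitrary):

    import paddle

    x = paddle.randn([3])
    y = paddle.randn([3])
    x.stop_gradient = False
    y.stop_gradient = False

    out = (x * x).sum()          # `y` does not participate in `out`
    dx, dy = paddle.grad(outputs=[out], inputs=[x, y],
                         create_graph=True, allow_unused=True)
    # dy is None since `out` is independent of `y`; filter before further use
    grads = [g for g in (dx, dy) if g is not None]
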
raise_exception (bool): whether to raise an exception if @@ -718,19 +740,25 @@ def fail_test(msg): paddle.disable_static() with _test_eager_guard(): - eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init) + eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init, + place) paddle.enable_static() static_double_grad = get_static_double_grad(x, y, x_init, y_grads_init, place) + if len(static_double_grad) != len(eager_double_grad): + msg = "The output grad tensor's number of static graph is different with dygraph, " \ + "please check the python api unit test used." + raise RuntimeError(msg) + for i in six.moves.xrange(len(static_double_grad)): if not np.allclose(static_double_grad[i], eager_double_grad[i], rtol, atol): - msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \ - 'and eager double grad %s on %s,\n' \ + msg = 'Check eager double result fail. Mismatch between static_graph double grad ' \ + 'and eager double grad on %s, the output double grad tensor\'s index is : %d \n' \ 'static:%s\n eager:%s\n' \ - % (static_double_grad[i].name, eager_double_grad[i].name, str(place), static_double_grad[i], eager_double_grad[i]) + % (str(place), i, static_double_grad[i], eager_double_grad[i]) return fail_test(msg) @@ -790,6 +818,7 @@ def get_static_triple_grad(x, def get_eager_triple_grad(func, x_init=None, dy_init=None, + place=None, return_mid_result=False): """ Get triple Grad result of dygraph. @@ -798,12 +827,13 @@ def get_eager_triple_grad(func, func: A wrapped dygraph function that its logic is equal to static program x_init (numpy.array|list[numpy.array]|None): the init value for input x. dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. return_mid_result (list[Tensor], list[Tensor]): If set True, the Returns: A list of numpy array that stores second derivative result calulated by dygraph """ dd_y, dd_x = get_eager_double_grad( - func, x_init, dy_init, return_mid_result=True) + func, x_init, dy_init, place, return_mid_result=True) # calcluate third derivative dddys = [] @@ -835,7 +865,6 @@ def triple_grad_check_for_dygraph(func, y (Variable|list[Variable]): output variables to the program. x_init (numpy.array|list[numpy.array]|None): the init value for input x. place (fluid.CPUPlace or fluid.CUDAPlace): the device. - eps (float): perturbation for finite differences. atol (float): absolute tolerance. rtol (float): relative tolerance. raise_exception (bool): whether to raise an exception if @@ -864,17 +893,23 @@ def fail_test(msg): paddle.disable_static() with _test_eager_guard(): - eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init) + eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init, + place) paddle.enable_static() static_triple_grad = get_static_triple_grad(x, y, x_init, y_grads_init, place) + if len(static_triple_grad) != len(eager_triple_grad): + msg = "The output grad tensor's number of static graph is different with dygraph, " \ + "please check the python api unit test used." + raise RuntimeError(msg) + for i in six.moves.xrange(len(static_triple_grad)): if not np.allclose(static_triple_grad[i], eager_triple_grad[i], rtol, atol): - msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \ - 'and eager double grad %s on %s,\n' \ + msg = 'Check eager double result fail. 
Mismatch between static_graph double grad ' \ + 'and eager double grad on %s, the output double grad tensor\'s index is : %d \n' \ 'static:%s\n eager:%s\n' \ - % (static_triple_grad[i].name, eager_triple_grad[i].name, str(place), static_triple_grad[i], eager_triple_grad[i]) + % (str(place), i, static_triple_grad[i], eager_triple_grad[i]) return fail_test(msg) diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 79a2430a16170..4826b37512614 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -11,4 +11,5 @@ if(WITH_IPU) set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) + set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600) endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 26fd42be6cd27..2583d9409a0a7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -15,9 +15,10 @@ import os import random import unittest -import numpy as np from enum import IntEnum +from typing import Dict, List, Optional +import numpy as np import paddle import paddle.static @@ -33,31 +34,27 @@ } +def np_dtype_to_fluid_str(dtype: np.dtype) -> str: + return map_np_dtype_to_fluid_dtype[dtype.name] + + class ExecutionModeFull(IntEnum): # Run fp32 model on cpu CPU_FP32 = 1 # Run fp32 model on ipu IPU_FP32 = 2 - # Convert model to fp16 using popart transform + # Convert model to fp16 using mixed-precision approch # All parameters will be converted to fp16 - # TODO rename to IPU_FP16 - IPU_POPART_FP16 = 3 - # Mix-precision mode, using `paddle.static.amp.fp16_guard()` to control the - # precision of each operator - IPU_MIXED_PRECISION = 4 + IPU_FP16 = 3 class ExecutionMode(IntEnum): CPU_FP32 = ExecutionModeFull.CPU_FP32 IPU_FP32 = ExecutionModeFull.IPU_FP32 - IPU_POPART_FP16 = ExecutionModeFull.IPU_POPART_FP16 - + IPU_FP16 = ExecutionModeFull.IPU_FP16 -def np_dtype_to_fluid_str(dtype: np.dtype) -> str: - return map_np_dtype_to_fluid_dtype[dtype.name] - -class IPUOpTest(unittest.TestCase): +class IPUTest(unittest.TestCase): @classmethod def setUpClass(cls): # Get random seeds @@ -67,12 +64,7 @@ def setUpClass(cls): cls.SEED = 2021 np.random.seed(cls.SEED) random.seed(cls.SEED) - - # For ipu, most ops support fp16 - cls.amp_list = paddle.static.amp.CustomOpLists( - custom_black_list=[], custom_white_list=[]) - cls.amp_list.unsupported_list = {} - cls.amp_list.black_list = {} + paddle.seed(cls.SEED) # Enable paddle static graph mode paddle.enable_static() @@ -83,6 +75,7 @@ def tearDownClass(cls): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + # Check if ipumodel mode is enabled @classmethod def use_ipumodel(cls): if 'POPLAR_IPUMODEL' not in os.environ: @@ -92,6 +85,69 @@ def use_ipumodel(cls): if flag.upper() in ['1', "TRUE"]: return True + # Decorator for static graph building + def static_graph(builder): + def wrapper(self, *args, **kwargs): + self.scope = paddle.static.Scope() + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + self.main_prog.random_seed = self.SEED + self.startup_prog.random_seed = self.SEED + with paddle.static.scope_guard(self.scope): + with 
paddle.utils.unique_name.guard( + paddle.utils.unique_name.generate('')): + with paddle.static.program_guard(self.main_prog, + self.startup_prog): + builder(self, *args, **kwargs) + + return wrapper + + # Cast a fp32 model to a full-fp16 model + @classmethod + def cast_model_to_fp16(cls, main_program): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + main_program, amp_list, use_fp16_guard=False) + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + main_program, + to_fp16_var_names=to_fp16_var_names) + + +class IPUOpTest(IPUTest): + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Items that a op_tester needs + cls.main_prog: paddle.static.Program = None + cls.startup_prog: paddle.static.Program = None + cls.scope: paddle.static.Scope = None + cls.feed_list: List[str] = None + cls.fetch_list: List[str] = None + cls.output_dict: Optional[Dict] = {} + + @property + def fp16_enabled(self): + return True + + def skip_mode(self, exec_mode): + if exec_mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + return True + else: + return False + + def is_ipu_mode(self, exec_mode): + if exec_mode == ExecutionMode.CPU_FP32: + return False + return True + + def is_fp16_mode(self, exec_mode): + if exec_mode != ExecutionMode.IPU_FP16: + return False + return True + def set_atol(self): self.atol = 1e-10 self.rtol = 1e-6 @@ -102,55 +158,90 @@ def set_training(self): self.is_training = False self.epoch = 1 - def check(self, outputs, check_shape=False): - cpu_fp32 = outputs[ExecutionMode.CPU_FP32] - ipu_fp32 = outputs[ExecutionMode.IPU_FP32] - max_diff = np.abs(cpu_fp32 - ipu_fp32).max() - fp32_flag = np.allclose( - cpu_fp32, ipu_fp32, rtol=self.rtol, atol=self.atol) - self.assertTrue(fp32_flag, "max diff is %f" % (max_diff)) + def run_op_test(self, exec_mode, ipu_strategy=None): + # NOTE: some op has no inputs + # if len(self.feed_list) == 0 or len(self.fetch_list) == 0: + # raise ValueError('feed_list or fetch_list is empty') + if self.is_ipu_mode(exec_mode): + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + if self.is_ipu_mode(exec_mode): + if ipu_strategy is None: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if self.is_fp16_mode(exec_mode): + ipu_strategy.set_precision_config(enable_fp16=True) + IPUOpTest.cast_model_to_fp16(self.main_prog) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + feed = self.feed_fp32 + if self.is_fp16_mode(exec_mode): + feed = self.feed_fp16 + + if self.is_training: + result = [] + for _ in range(self.epoch): + loss_res = exe.run(program, + feed=feed, + fetch_list=self.fetch_list) + result.append(loss_res) + else: + result = exe.run(program, feed=feed, fetch_list=self.fetch_list) + + if isinstance(result, list) and len(result) == 1: + self.output_dict[exec_mode] = result[0] + else: + self.output_dict[exec_mode] = result + + def check(self, check_shape=False, output_dict=None): + if output_dict is None: + output_dict = self.output_dict + if len(output_dict) == 0: + raise ValueError("output_dict is empty") + cpu_fp32 = output_dict[ExecutionMode.CPU_FP32] + ipu_fp32 = output_dict[ExecutionMode.IPU_FP32] + cpu_fp32 = np.asarray(cpu_fp32).astype(np.float32).flatten() + ipu_fp32 = 
np.asarray(ipu_fp32).astype(np.float32).flatten() + pass_check = np.allclose( + ipu_fp32, cpu_fp32, rtol=self.rtol, atol=self.atol) + if not pass_check: + max_atol = np.abs(ipu_fp32 - cpu_fp32).max() + cpu_fp32_abs = np.abs(cpu_fp32) + cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 + max_rtol = (np.abs(ipu_fp32 - cpu_fp32) / cpu_fp32_abs).max() + raise AssertionError( + f"ipu_fp32 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + ) if check_shape: self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) - ipu_popart_fp16 = None - if ExecutionMode.IPU_POPART_FP16 in outputs.keys(): - ipu_popart_fp16 = outputs[ExecutionMode.IPU_POPART_FP16] - max_diff = np.abs(ipu_popart_fp16.astype(np.float32) - - cpu_fp32).max() - fp16_flag = np.allclose( - ipu_popart_fp16.astype(np.float32), - cpu_fp32, - rtol=self.rtol_fp16, - atol=self.atol_fp16) - self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + if ExecutionMode.IPU_FP16 in output_dict.keys(): + ipu_fp16 = output_dict[ExecutionMode.IPU_FP16] + ipu_fp16 = np.asarray(ipu_fp16).astype(np.float32).flatten() + pass_check = np.allclose( + ipu_fp16, cpu_fp32, rtol=self.rtol_fp16, atol=self.atol_fp16) + if not pass_check: + max_atol = np.abs(ipu_fp16 - cpu_fp32).max() + cpu_fp32_abs = np.abs(cpu_fp32) + cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 + max_rtol = (np.abs(ipu_fp16 - cpu_fp32) / cpu_fp32_abs).max() + raise AssertionError( + f"ipu_fp16 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + ) if check_shape: - self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) - - ipu_mixed_precision = None - if ExecutionModeFull.IPU_MIXED_PRECISION in outputs.keys(): - ipu_mixed_precision = outputs[ - ExecutionModeFull.IPU_MIXED_PRECISION] - max_diff = np.abs( - ipu_mixed_precision.astype(np.float32) - cpu_fp32).max() - fp16_flag = np.allclose( - ipu_mixed_precision.astype(np.float32), - cpu_fp32, - rtol=self.rtol_fp16, - atol=self.atol_fp16) - self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) - - if check_shape: - self.assertTrue(ipu_mixed_precision.shape == cpu_fp32.shape) - - if ExecutionMode.IPU_POPART_FP16 in outputs.keys( - ) and ExecutionModeFull.IPU_MIXED_PRECISION in outputs.keys(): - max_diff = np.abs(ipu_popart_fp16 - ipu_mixed_precision).max() - self.assertEqual(ipu_popart_fp16.all(), - ipu_mixed_precision.all(), - "max diff is %f" % (max_diff)) - - if check_shape: - self.assertTrue( - ipu_popart_fp16.shape == ipu_mixed_precision.shape) + self.assertTrue(ipu_fp16.shape == cpu_fp32.shape) + + # Execution Mode + class ExecutionMode(IntEnum): + CPU_FP32 = ExecutionModeFull.CPU_FP32 + IPU_FP32 = ExecutionModeFull.IPU_FP32 + IPU_FP16 = ExecutionModeFull.IPU_FP16 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py index 138365b650f24..b90c3374db96e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py @@ -18,8 +18,7 @@ import paddle import paddle.nn.functional as F import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,10 +31,6 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_test_op(self): self.op = paddle.fluid.layers.relu self.op_attrs = {} @@ -49,60 +44,22 @@ def 
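The new `run_op_test` helper above centralizes what every `_test_base` used to repeat: pick a place, run the startup program, and, for IPU modes, build an `IpuStrategy`, optionally cast the program to fp16, and compile with `IpuCompiledProgram` before executing. Stripped of the test scaffolding, the IPU path looks roughly like this (the graph, feed and fetch names are placeholders, and an IPU-enabled build of Paddle is assumed):

    import numpy as np
    import paddle
    import paddle.static

    paddle.enable_static()
    main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[1, 3, 3, 3], dtype='float32')
        out = paddle.fluid.layers.relu(x)

    exe = paddle.static.Executor(paddle.IPUPlace())
    exe.run(startup_prog)

    ipu_strategy = paddle.static.IpuStrategy()
    ipu_strategy.set_graph_config(is_training=False)
    # enable_fp16=True would additionally require casting the program,
    # as IPUOpTest.cast_model_to_fp16 does above
    program = paddle.static.IpuCompiledProgram(
        main_prog, ipu_strategy=ipu_strategy).compile(['x'], [out.name])

    feed = {'x': np.random.uniform(size=[1, 3, 3, 3]).astype(np.float32)}
    result = exe.run(program, feed=feed, fetch_list=[out.name])
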
set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = self.op(x, **self.op_attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = self.op(x, **self.op_attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTanh(TestRelu): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py index d14eba98ef5d7..c48ce75ccd9f3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,12 +30,8 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): - data = np.random.uniform(size=[10, 1000]) + data = np.random.uniform(size=[10, 500]).astype(np.float16) self.feed_fp32 = {"in_0": data.astype(np.float32)} self.feed_fp16 = {"in_0": data.astype(np.float16)} @@ -48,64 +43,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = 
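After the refactor, an op test only has to describe the graph; execution and result comparison come from the base class. Adding another activation case is therefore a few lines, as the `TestTanh` subclass above shows; a hypothetical `TestSqrt` (not part of this patch) would follow the same shape:

    class TestSqrt(TestRelu):
        def set_test_op(self):
            # only the op under test changes; build_model/run_model/test
            # are inherited from TestRelu / IPUOpTest
            self.op = paddle.sqrt
            self.op_attrs = {}
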
paddle.fluid.layers.argmax(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0].astype(np.int32) - - def test_base(self): - output_dict_fp32 = {} - output_dict_fp16 = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - - if mode > ExecutionMode.IPU_FP32: - output_dict_fp16[mode] = self._test_base(mode).flatten() - else: - output_dict_fp32[mode] = self._test_base(mode).flatten() - - self.check(output_dict_fp32) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.argmax(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + for k, v in self.output_dict.items(): + self.output_dict[k] = v.astype(np.int32) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index 35f4ca17d5eba..1239a97f2f653 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,10 +29,6 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -42,60 +38,23 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - assign = paddle.assign(x) - out = paddle.fluid.layers.elementwise_add(assign, assign) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = 
self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.assign(x) + out = paddle.fluid.layers.elementwise_add(x, x) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestAssignFp32Value(TestBase): @@ -107,51 +66,13 @@ def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.assign_fp32 = data.astype(np.float32) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - assign = paddle.assign(self.assign_fp32) - out = paddle.fluid.layers.elementwise_add(x, assign) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + assign = paddle.assign(self.assign_fp32) + out = paddle.fluid.layers.elementwise_add(x, assign) + self.fetch_list = [out.name] class TestAssignBoolValue(TestBase): @@ -162,52 +83,15 @@ def set_data_feed(self): data = np.random.choice([True, False], size=(2, 3, 1)) self.assign_bool = data.astype(np.bool) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - 
shape=self.feed_shape[0], - dtype='float32') - x = paddle.less_than(x, x) - assign = paddle.assign(self.assign_bool) - out = paddle.logical_and(x, assign) - out = paddle.cast(out, 'float32') - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.less_than(x, x) + assign = paddle.assign(self.assign_bool) + x = paddle.logical_and(x, assign) + out = paddle.cast(x, 'float32') + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index f34e5b0d8b9dc..cf494034fd86f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,10 +29,6 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 2e-6 self.rtol = 1e-5 @@ -48,67 +44,32 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - - fetch_list = [x.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_options({'need_avg_shard': True}) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - 
ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({'need_avg_shard': True}) + self.run_op_test(exec_mode, ipu_strategy) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index c640cd441f1b2..adb2abfc47418 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-5 @@ -56,61 +51,24 @@ def set_op_attrs(self): self.attrs['data_layout'] = 'NCHW' self.attrs['in_place'] = False - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - out = paddle.fluid.layers.batch_norm(conv1, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, 
fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.fluid.layers.batch_norm(x, **self.attrs) + self.fetch_list = [x.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py deleted file mode 100644 index ef61e651b2ad9..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import unittest -import paddle -import paddle.static - -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestFunc(unittest.TestCase): - def _test_func(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - bps = 5 - n = 1 if run_ipu else -1 - c, h, w = 3, 10, 10 - np_image = np.random.uniform(size=[1 * bps, c, h, w]).astype(np.float32) - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[n, c, h, w], dtype='float32') - conv2d = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - - out = conv2d - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [out.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=False) - ipu_strategy.set_pipelining_config(batches_per_step=bps) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = exe.run(program, - feed={image.name: np_image}, - fetch_list=[out]) - return result[0] - - def test_func(self): - ipu_res = self._test_func(True) - cpu_res = self._test_func(False) - - if np.prod(ipu_res.shape) == np.prod(cpu_res.shape): - ipu_res = ipu_res.reshape(cpu_res.shape) - - self.assertTrue(np.allclose(ipu_res, cpu_res, atol=1e-4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 2de23d95e1c96..4d412f2a79977 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -30,175 +30,81 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - def set_atol(self): - self.atol = 1e-3 + @property + def fp16_enabled(self): + return False def set_data_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), - } + data = np.random.uniform(size=[1, 3, 3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = paddle.cast(x, **self.attrs) - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = 
paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + out = paddle.cast(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestEnableFp16(TestBase): - def set_atol(self): - self.atol = 1e-10 + @property + def fp16_enabled(self): + return True + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def set_data_feed(self): - self.feed = {"x": np.array([1, 200, 3000, 40000]).astype('int32'), } + data = np.random.uniform(size=[1, 3, 3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = paddle.cast(x, **self.attrs) - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - class TestDisableTransferCast(TestEnableFp16): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = {"x": np.array([1, 200, 3000, 40000]).astype('int32'), } + data = np.random.uniform(size=[1, 3, 3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - 
dtype=self.feed_dtype[0]) - out = paddle.cast(x, **self.attrs) - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_precision_config(enable_fp16=True) - ipu_strategy.set_options({"transfer_cast_op": False}) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({"transfer_cast_op": False}) + self.run_op_test(exec_mode) class TestCase2(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } @@ -208,11 +114,8 @@ def set_op_attrs(self): class TestCase3(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), } @@ -222,11 +125,8 @@ def set_op_attrs(self): class TestCase4(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), } @@ -236,11 +136,8 @@ def set_op_attrs(self): class TestCase5(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } @@ -250,11 +147,8 @@ def set_op_attrs(self): class TestCase6(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), } @@ -273,7 +167,7 @@ def set_op_attrs(self): @unittest.skip('skip float16 to float32') class TestCase3(TestBase): def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } @@ -285,10 +179,11 @@ def set_op_attrs(self): @unittest.skip('int32 to int8 is not supported') class TestCase4(TestBase): def set_atol(self): + super().set_atol() self.atol = 1 def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.randint( low=1, high=100, size=[1, 3, 3, 3]).astype('int32'), } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index c5a8090283940..a5410ab499082 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,14 +30,9 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.random.uniform(size=[1, 3, 10, 10]) data2 = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = { 'x': 
data1.astype(np.float32), 'y': data2.astype(np.float32) @@ -55,63 +49,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.concat([x, y], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.concat([x, y], **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index ade54fda86929..e450621b11d34 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -26,26 +26,19 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_data_feed() - self.set_feed_attr() + self.set_feed() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 - def set_data_feed(self): + def set_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} self.feed_fp16 = {'in_0': data.astype(np.float16)} - - def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = 
list(self.feed_fp32.keys()) @@ -59,59 +52,22 @@ def set_op_attrs(self): self.attrs['groups'] = 1 self.attrs['data_format'] = 'NCHW' - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.conv2d(image, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.fluid.layers.conv2d(x, **self.attrs) + self.fetch_list = [x.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 3a21f0cb0079c..d035673e219df 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[3, 7]) label = np.arange(3).reshape([3, 1]) @@ -53,81 +49,31 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {'soft_label': False, } - def np_nll_loss(self): - tmp = -np.log(self.feed_fp32['x']) - label = self.feed_fp32['label'] - indice = [range(label.shape[0]), label.flatten()] - self.np_ref = tmp[indice] - - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): 
- x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - if exec_mode != ExecutionMode.CPU_FP32: - label = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - else: - label = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int64') - - out = paddle.fluid.layers.cross_entropy( - input=x, label=label, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - if exec_mode != ExecutionMode.CPU_FP32: - feed['label'] = feed['label'].astype(np.int32) - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + if on_ipu: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + else: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int64') + out = paddle.fluid.layers.cross_entropy( + input=x, label=label, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['label'] = self.feed_fp32['label'].astype(np.int32) + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - self.np_nll_loss() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -142,7 +88,6 @@ class TestCase2(TestBase): def set_data_feed(self): x = np.random.uniform(size=[30, 70]) label = np.arange(30).reshape([30, 1]) - self.feed_fp32 = { "x": x.astype(np.float32), "label": label.astype(np.int64) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py index 2f1d86daf0057..a0a145fb72b35 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -48,60 +48,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with 
paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - out = paddle.fluid.layers.cumsum(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + out = paddle.fluid.layers.cumsum(x, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index e34da7f70167a..4e3b03ffca068 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'x': data.astype(np.float32)} @@ -51,60 +46,23 @@ def set_op_attrs(self): "dropout_implementation": "downgrade_in_infer" } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - dropout = paddle.fluid.layers.dropout(x, **self.attrs) - out = paddle.fluid.layers.elementwise_add(dropout, dropout) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.fluid.layers.dropout(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x, x) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index a9d6d2308326e..24082fe49bae5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -43,63 +42,24 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = self.op(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], 
shape=self.feed_shape[1], dtype='float32') + out = self.op(x, y, **self.attrs) + self.fetch_list = [out.name] - def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def run_test_base(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def test_case0(self): data_x = np.random.uniform(size=(2, 3, 4, 5)) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index 5b18c73851324..56b9a73f08009 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.ones([1, 10]) y = np.zeros([1, 10]) @@ -53,63 +49,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.equal(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.equal(x, y, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if 
not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 966dfdef87b54..211aa4a61a5b8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"expand_times": [1, 2, 2]} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - out = paddle.fluid.layers.expand(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + out = paddle.fluid.layers.expand(x, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -116,53 +75,15 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - expand_times = 
paddle.fluid.layers.fill_constant( - shape=[len(self.feed_shape[0])], dtype="int32", value=2) - out = paddle.fluid.layers.expand( - x, expand_times=expand_times, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + expand_times = paddle.fluid.layers.fill_constant( + shape=[len(self.feed_shape[0])], dtype="int32", value=2) + out = paddle.fluid.layers.expand( + x, expand_times=expand_times, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py index 00b855a5a7a42..b3faabda3cdf2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -46,60 +42,23 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {'fill_value': 0.3, 'dtype': 'float32'} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - x_fill = paddle.full_like(x, **self.attrs) - out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x_fill = paddle.full_like(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index 3a1c202bf1133..ce457b7abeb5b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,17 +30,14 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): - self.feed = {} + self.feed_fp32 = {} + self.feed_fp16 = {} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = { @@ -50,50 +47,21 @@ def set_op_attrs(self): 'value': 0.3, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.fluid.layers.fill_constant(**self.attrs) - out = paddle.fluid.layers.elementwise_add(x, x) - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.fluid.layers.fill_constant(**self.attrs) + out = paddle.fluid.layers.elementwise_add(x, x) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + def 
run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py index 6f0cafc66805e..a8d530f6b77ad 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 2, 4, 6]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_op_attrs(self): self.attrs = {} self.attrs['axis'] = 1 - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.flatten(x=x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.flatten(x=x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py deleted file mode 100644 index cd29ff705b88f..0000000000000 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import shutil - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_data_feed() - self.set_feed_attr() - self.set_op_attrs() - - def set_atol(self): - self.atol = 1e-6 - self.rtol = 1e-5 - self.atol_fp16 = 1e-2 - self.rtol_fp16 = 1e-3 - - def set_data_feed(self): - data = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = {"in_0": data.astype(np.float32)} - self.feed_fp16 = {"in_0": data.astype(np.float16)} - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'sgd' - self.attrs['path'] = 'model' - self.attrs['model_name'] = 'test' - - def _test_save(self): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - generator = paddle.fluid.unique_name.UniqueNameGenerator() - self.full_name = '/'.join( - [self.attrs['path'], self.attrs['model_name']]) - - with paddle.fluid.unique_name.guard(generator): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - scale = paddle.fluid.layers.scale( - x, scale=1.0, bias=0.0, bias_after_scale=True) - conv = paddle.static.nn.conv2d( - scale, - num_filters=3, - filter_size=3, - bias_attr=False, - name='conv2d') - loss = paddle.mean(conv) - - if self.attrs['is_training']: - if self.attrs['opt_type'] == 'sgd': - sgd = paddle.optimizer.SGD(learning_rate=1e-2) - sgd.minimize(loss) - elif self.attrs['opt_type'] == 'adam': - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - elif self.attrs['opt_type'] == 'lamb': - lamb = paddle.optimizer.Lamb(learning_rate=1e-2) - lamb.minimize(loss) - - fetch_list = [loss.name] - - place = paddle.IPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=True) - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) - - for _ in range(self.attrs['steps']): - exe.run(program, feed=self.feed_fp16, fetch_list=fetch_list) - - 
paddle.static.save_inference_model( - self.full_name, x, loss, exe, program=program.org_program) - - def _test_load(self, run_ipu): - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - - [inference_program, feed_target_names, fetch_targets] = ( - paddle.static.load_inference_model(self.full_name, exe)) - - if run_ipu: - feed_list = feed_target_names - fetch_list = [fetch_targets[0].name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=False) - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - inference_program, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = inference_program - - feed = self.feed_fp16 if run_ipu else self.feed_fp32 - result = [] - for i in range(10): - feed["in_0"] += np.array([1.1 * i]).astype(feed["in_0"].dtype) - out = exe.run(program, feed=feed, fetch_list=[fetch_targets]) - result.append(out) - - return np.array(result) - - def test_base(self): - self._test_save() - cpu_res = self._test_load(False) - ipu_res = self._test_load(True).astype(np.float32) - - self.assertTrue( - np.allclose( - cpu_res, ipu_res, rtol=self.rtol_fp16, atol=self.atol_fp16)) - - shutil.rmtree(self.attrs['path'], True) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py index 71742deefcd2c..1d3b17dbc2dfc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py @@ -16,9 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 5e-6 self.rtol = 1e-5 @@ -54,80 +49,32 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - add1 = conv1 + conv2 - conv3 = paddle.static.nn.conv2d( - add1, num_filters=8, filter_size=8, bias_attr=False) - out = paddle.fluid.layers.relu(conv3, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = 
paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) - - -class TestIntInput(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.set_data_feed() - self.set_feed_attr() - self.set_op_attrs() - - @property - def fp16_enabled(self): - return True - + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + conv2 = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + add1 = conv1 + conv2 + conv3 = paddle.static.nn.conv2d( + add1, num_filters=8, filter_size=8, bias_attr=False) + out = paddle.fluid.layers.relu(conv3, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestIntInput(TestBase): def set_data_feed(self): embedding = np.random.uniform(size=[10, 20]) indice = np.array([1, 3, 5]).astype(np.int32) @@ -140,71 +87,14 @@ def set_data_feed(self): "indice": indice, } - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - - def set_op_attrs(self): - self.attrs = {} - - def _test_base(self, exec_mode): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - - out = paddle.fluid.layers.gather(x, index=y) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return np.array(result) - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + 
@IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + out = paddle.fluid.layers.gather(x, index=y) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index 01a56fd14be04..bbf3ec0ffdfe6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[10, 20]) y = np.array([1, 3, 5]) @@ -47,63 +43,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - - out = paddle.fluid.layers.gather(x, index=y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + out = paddle.fluid.layers.gather(x, index=y, **self.attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 602289f3f1904..e9721463876d0 
100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -46,59 +42,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"approximate": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.gelu(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.gelu(x, **self.attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py index 281baeca09e47..b7567f60cc3a2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py @@ -28,19 +28,26 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() self.set_attrs() + self.set_training() + + @property + def fp16_enabled(self): + return False def set_atol(self): + super().set_atol() self.atol = 1e-6 + self.rtol = 1e-5 def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "image": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - 
self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_attrs(self): self.attrs = { @@ -48,76 +55,48 @@ def set_attrs(self): "weight_decay": 0.0, } - def _test_optimizer(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - np.random.seed(self.SEED) - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[1, 3, 10, 10], dtype='float32') - conv1 = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - loss = paddle.mean(conv1) - - weight_decay = self.attrs['weight_decay'] - # Only support ClipGradByGlobalNorm - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - if self.attrs['optimizer'] == 'sgd': - opt = paddle.optimizer.SGD(learning_rate=1e-1, - weight_decay=weight_decay, - grad_clip=clip) - elif self.attrs['optimizer'] == 'adam': - opt = paddle.optimizer.Adam( - learning_rate=1e-1, - weight_decay=weight_decay, - grad_clip=clip) - elif self.attrs['optimizer'] == 'lamb': - opt = paddle.optimizer.Lamb( - learning_rate=1e-1, - lamb_weight_decay=weight_decay, - grad_clip=clip) - else: - raise ValueError( - f"Not supported optimizer {self.attrs['optimizer']} for test" - ) - opt.minimize(loss) - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = [] - for epoch in range(100): - loss_res = exe.run(program, feed=self.feed, fetch_list=[loss]) - result.append(loss_res) - - return np.array(result) + def set_training(self): + self.is_training = True + self.epoch = 100 + + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + self.fetch_list = [loss.name] + + weight_decay = self.attrs['weight_decay'] + # Only support ClipGradByGlobalNorm + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + if self.attrs['optimizer'] == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-1, + weight_decay=weight_decay, + grad_clip=clip) + elif self.attrs['optimizer'] == 'adam': + opt = paddle.optimizer.Adam( + learning_rate=1e-1, weight_decay=weight_decay, grad_clip=clip) + elif self.attrs['optimizer'] == 'lamb': + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, + lamb_weight_decay=weight_decay, + grad_clip=clip) + else: + raise ValueError( + f"Not supported optimizer {self.attrs['optimizer']} for test") + opt.minimize(loss) + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) - ipu_loss = self._test_optimizer(True).flatten() - cpu_loss = self._test_optimizer(False).flatten() - - self.assertTrue(np.allclose(ipu_loss, cpu_loss, 
atol=self.atol)) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestAdam(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py index 934ad10142827..c499bb0bd5ff9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -28,73 +28,30 @@ def setUp(self): self.set_training() self.set_test_op() - @property - def fp16_enabled(self): - return True - def set_test_op(self): self.op = paddle.fluid.layers.greater_than def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = self.op(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = self.op(x, y, **self.attrs) + self.fetch_list = [out.name] - def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def run_test_base(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index 102e764cb2f17..bb984a8d90789 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -17,7 +17,7 
@@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 3e-6 self.rtol = 1e-6 @@ -56,86 +52,36 @@ def set_op_attrs(self): "data_layout": 'NCHW', } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if self.is_training: - ch = self.feed_shape[0][1] - conv1 = paddle.static.nn.conv2d( - x, num_filters=ch, filter_size=3, bias_attr=False) - scale = paddle.ParamAttr(trainable=True) - bias = paddle.ParamAttr(trainable=True) - out = paddle.fluid.layers.nn.group_norm( - conv1, param_attr=scale, bias_attr=bias, **self.attrs) - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - else: - out = paddle.fluid.layers.nn.group_norm( - x, param_attr=True, bias_attr=True, **self.attrs) - - if self.is_training: - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - if mode > ExecutionMode.IPU_FP32 and self.is_training: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + if self.is_training: + ch = self.feed_shape[0][1] + conv1 = paddle.static.nn.conv2d( + x, num_filters=ch, filter_size=3, bias_attr=False) + scale = paddle.ParamAttr(trainable=True) + bias = paddle.ParamAttr(trainable=True) + out = paddle.fluid.layers.nn.group_norm( + conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + out = paddle.fluid.layers.nn.group_norm( + x, param_attr=True, bias_attr=True, **self.attrs) + self.fetch_list = [out.name] 
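The same refactor recurs across these IPU unit tests: the hand-written `_test_base` scope/program/executor boilerplate is replaced by helpers on `IPUOpTest` (`static_graph`, `run_op_test`, `skip_mode`, `ExecutionMode`, `check`). Those helpers are defined in `op_test_ipu.py` and are not part of this patch, so the following is only a hedged sketch of how a refactored test is assumed to fit together; the class name and the relu operator are illustrative placeholders, not part of the change.

import unittest
import numpy as np
import paddle
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest


@unittest.skipIf(not paddle.is_compiled_with_ipu(),
                 "core is not compiled with IPU")
class TestReluSketch(IPUOpTest):
    # Placeholder test illustrating the refactored pattern only.
    def setUp(self):
        self.set_atol()
        self.set_training()
        data = np.random.uniform(size=[1, 3, 10, 10])
        self.feed_fp32 = {"in_0": data.astype(np.float32)}
        self.feed_fp16 = {"in_0": data.astype(np.float16)}
        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
        self.feed_list = list(self.feed_fp32.keys())

    @IPUOpTest.static_graph
    def build_model(self):
        # The decorator is assumed to create the fresh Program/scope that the
        # deleted _test_base code used to build by hand.
        x = paddle.static.data(
            name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32')
        out = paddle.fluid.layers.relu(x)
        self.fetch_list = [out.name]

    def run_model(self, exec_mode):
        # run_op_test is assumed to pick CPU/IPU and FP32/FP16 from exec_mode
        # and to feed self.feed_fp32 or self.feed_fp16 accordingly.
        self.run_op_test(exec_mode)

    def test(self):
        for m in IPUOpTest.ExecutionMode:
            if not self.skip_mode(m):
                self.build_model()
                self.run_model(m)
        self.check()


if __name__ == "__main__":
    unittest.main()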
+ + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -150,7 +96,7 @@ def set_op_attrs(self): class TestTrainCase1(TestBase): def set_training(self): self.is_training = True - self.epoch = 10 + self.epoch = 20 @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") @@ -170,7 +116,7 @@ def set_op_attrs(self): def set_training(self): self.is_training = True - self.epoch = 10 + self.epoch = 20 if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index ed8f3950ace82..fa425cbf9f94a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-5 @@ -52,86 +48,37 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"epsilon": 1e-05} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if self.is_training: - ch = self.feed_shape[0][1] - conv1 = paddle.static.nn.conv2d( - x, num_filters=ch, filter_size=3, bias_attr=False) - scale = paddle.ParamAttr(trainable=True) - bias = paddle.ParamAttr(trainable=True) - out = paddle.fluid.layers.nn.instance_norm( - conv1, param_attr=scale, bias_attr=bias, **self.attrs) - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - else: - out = paddle.fluid.layers.nn.instance_norm( - x, param_attr=True, bias_attr=True, **self.attrs) - - if self.is_training: - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + 
@IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + + if self.is_training: + ch = self.feed_shape[0][1] + conv1 = paddle.static.nn.conv2d( + x, num_filters=ch, filter_size=3, bias_attr=False) + scale = paddle.ParamAttr(trainable=True) + bias = paddle.ParamAttr(trainable=True) + out = paddle.fluid.layers.nn.instance_norm( + conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + out = paddle.fluid.layers.nn.instance_norm( + x, param_attr=True, bias_attr=True, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - if mode > ExecutionMode.IPU_FP32 and self.is_training: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTrainCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api.py deleted file mode 100644 index a306a3f7725b5..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuShard(unittest.TestCase): - def _test(self): - # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_index : no - - with paddle.static.ipu_shard_guard(index=1): - c = b + 1 # scale, ipu_index : 1 - with paddle.static.ipu_shard_guard(index=2): - d = c * 2 # scale, ipu_index : 2 - with paddle.static.ipu_shard_guard(index=3): - e = d + 3 # scale, ipu_index : 3 - with paddle.static.ipu_shard_guard(index=1): - e = e + 3 # scale, ipu_index : 1 - with paddle.static.ipu_shard_guard(index=2): - e = e + 3 # scale, ipu_index : 2 - - with paddle.static.ipu_shard_guard(index=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 - - with paddle.static.ipu_shard_guard(index=2): - g = f - 1 # scale, ipu_index : 2 - - h = g + 1 # scale, ipu_index : no - - ipu_index_list = [] - main_prog = paddle.static.default_main_program() - for op in main_prog.global_block().ops: - if op.desc.has_attr("ipu_index"): - ipu_index_list.append(op.desc.attr("ipu_index")) - - return ipu_index_list - - def test_ipu_shard(self): - ipu_index_list = self._test() - expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] - self.assertTrue( - np.allclose( - ipu_index_list, expected_ipu_index_list, atol=0)) - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuPipeline(unittest.TestCase): - def _test(self): - # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_stage : no - - with paddle.static.ipu_shard_guard(stage=1): - c = b + 1 # scale, ipu_stage : 1 - with paddle.static.ipu_shard_guard(stage=2): - d = c * 2 # scale, ipu_stage : 2 - with paddle.static.ipu_shard_guard(stage=3): - e = d + 3 # scale, ipu_stage : 3 - with paddle.static.ipu_shard_guard(stage=1): - e = e + 3 # scale, ipu_stage : 1 - with paddle.static.ipu_shard_guard(stage=2): - e = e + 3 # scale, ipu_stage : 2 - - with paddle.static.ipu_shard_guard(stage=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 - - with paddle.static.ipu_shard_guard(stage=2): - g = f - 1 # scale, ipu_stage : 2 - - h = g + 1 # scale, ipu_stage : no - - ipu_index_list = [] - main_prog = paddle.static.default_main_program() - for op in main_prog.global_block().ops: - if op.desc.has_attr("ipu_stage"): - ipu_index_list.append(op.desc.attr("ipu_stage")) - - return ipu_index_list - - def test_ipu_shard(self): - ipu_index_list = self._test() - expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] - - self.assertTrue( - np.allclose( - ipu_index_list, expected_ipu_index_list, atol=0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index debd9ed19827c..45f75f1b4df81 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -73,10 +73,15 @@ def test_set_other_options(self): 'autoReport.directory': 'path', 'autoReport.all': 'true' } + options['random_seed'] = 1234 for k, v in options.items(): ipu_strategy.set_options({k: v}) assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + # The custom logger need 2 int as inputs + logger = lambda progress, total: print(f"compile progrss: 
{progress}/{total}") + ipu_strategy.set_options({'compilation_progress_logger': logger}) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index a52946bba1567..cab2fa3fde2cb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-5 @@ -59,89 +55,48 @@ def set_op_attrs(self): } self.optimizer = None - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if self.is_training: - ch = self.feed_shape[0][1] - conv1 = paddle.static.nn.conv2d( - x, num_filters=ch, filter_size=3, bias_attr=False) - scale = paddle.ParamAttr(trainable=True) - bias = paddle.ParamAttr(trainable=True) - out = paddle.fluid.layers.nn.layer_norm( - conv1, param_attr=scale, bias_attr=bias, **self.attrs) - else: - scale = self.attrs['scale'] - bias = self.attrs['shift'] - out = paddle.fluid.layers.nn.layer_norm( - x, param_attr=scale, bias_attr=bias, **self.attrs) - loss = paddle.mean(out) - - fetch_list = [loss.name] - - if self.is_training: - optimizer = None - if self.optimizer == 'sgd': - optimizer = paddle.optimizer.SGD(learning_rate=1e-2) - elif self.optimizer == 'adam': - optimizer = paddle.optimizer.Adam(learning_rate=1e-2) - elif self.optimizer == 'lamb': - optimizer = paddle.optimizer.Lamb( - learning_rate=1e-2, lamb_weight_decay=0.0) - if optimizer is not None: - optimizer.minimize(loss) - - if exec_mode: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=self.feed_fp32, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, - feed=self.feed_fp32, - fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + if self.is_training: + ch = self.feed_shape[0][1] + conv1 = paddle.static.nn.conv2d( 
+ x, num_filters=ch, filter_size=3, bias_attr=False) + scale = paddle.ParamAttr(trainable=True) + bias = paddle.ParamAttr(trainable=True) + out = paddle.fluid.layers.nn.layer_norm( + conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + self.fetch_list = [loss.name] + else: + scale = self.attrs['scale'] + bias = self.attrs['shift'] + out = paddle.fluid.layers.nn.layer_norm( + x, param_attr=scale, bias_attr=bias, **self.attrs) + self.fetch_list = [out.name] + + if self.is_training: + optimizer = None + if self.optimizer == 'sgd': + optimizer = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + optimizer = paddle.optimizer.Lamb( + learning_rate=1e-2, lamb_weight_decay=0.0) + if optimizer is not None: + optimizer.minimize(loss) + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() @unittest.skip('raise error') @@ -188,33 +143,17 @@ def set_op_attrs(self): self.optimizer = 'sgd' def set_atol(self): + super().set_atol() self.atol = 1e-6 def set_training(self): self.is_training = True - self.epoch = 10 - - -class TestTrainCase2(TestBase): - def set_atol(self): - self.atol = 5e-4 - - def set_op_attrs(self): - self.attrs = { - "scale": True, - "shift": True, - "begin_norm_axis": 2, - "epsilon": 1e-05 - } - self.optimizer = 'adam' - - def set_training(self): - self.is_training = True - self.epoch = 10 + self.epoch = 20 class TestTrainCase3(TestBase): def set_atol(self): + super().set_atol() self.atol = 5e-3 def set_op_attrs(self): @@ -228,7 +167,7 @@ def set_op_attrs(self): def set_training(self): self.is_training = True - self.epoch = 10 + self.epoch = 20 # not support `layer_norm(x, param_attr=False, bias_attr=False, **self.attrs)` diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py index fad7516e442a7..c0e4865b3a627 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py @@ -18,7 +18,7 @@ import paddle import paddle.nn.functional as F import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +31,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -49,59 +45,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = F.log_softmax(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def 
build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = F.log_softmax(x, **self.attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py index 3f8472890d03e..725d2b3429a7f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,68 +29,32 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 20, 30528]) - self.feed = {"in_0": data.astype('bool')} + self.feed_fp32 = {"in_0": data.astype('bool')} + self.feed_fp16 = {"in_0": data.astype('bool')} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] - - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="bool") - - out = paddle.fluid.layers.logical_not(x) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, 
feed=self.feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).astype(np.int32) - - self.check(output_dict, check_shape=True) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="bool") + out = paddle.fluid.layers.logical_not(x) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py index 05572a72ea8b2..55a2c08c1b5e7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -38,69 +38,38 @@ def set_test_op(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) - - out = self.op(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + out = self.op(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).astype(np.int32) - - self.check(output_dict, 
check_shape=True) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) self.feed_dtype = ['bool', 'bool'] def set_data_feed0(self): x = np.random.choice([True, False], size=(1, 3, 5, 5)) y = np.random.choice([True, False], size=(1, 3, 5, 5)) - self.feed = { + self.feed_fp32 = { "x": x.astype('bool'), "y": y.astype('bool'), } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 4a877ddce4e3c..80636348cfad3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,19 +30,15 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) - self.feed_cpu = {"x": data.astype(np.int64)} - self.feed_ipu = {"x": data.astype(np.int32)} + self.feed_fp32 = {"x": data.astype(np.int64)} + self.feed_fp16 = {"x": data.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = { @@ -53,76 +49,30 @@ def set_op_attrs(self): "dtype": 'float32' } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int64') - - out = paddle.fluid.layers.embedding(x, **self.attrs) - - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_cpu - if exec_mode > ExecutionMode.CPU_FP32: - feed = self.feed_ipu - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = 
exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64') + out = paddle.fluid.layers.embedding(x, **self.attrs) + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['x'] = self.feed_fp32['x'].astype(np.int32) + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or - self.is_training): - break - - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTrainCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py index da8048fb3205e..7f021a615afa0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,19 +30,15 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) - self.feed_cpu = {"x": x.astype(np.int64)} - self.feed_ipu = {"x": x.astype(np.int32)} + self.feed_fp32 = {"x": x.astype(np.int64)} + self.feed_fp16 = {"x": x.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = { @@ -53,76 +49,31 @@ def set_op_attrs(self): "weight_attr": None } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int64') - - embedding = paddle.nn.Embedding(**self.attrs) - out = embedding(x) - - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list 
- ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_cpu - if exec_mode > ExecutionMode.CPU_FP32: - feed = self.feed_ipu - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64') + embedding = paddle.nn.Embedding(**self.attrs) + out = embedding(x) + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['x'] = self.feed_fp32['x'].astype(np.int32) + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or - self.is_training): - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTrainCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py index 58f018e2ae649..6641efde69473 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py @@ -12,89 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. 
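The LR_New scheduler modified below follows the `paddle.optimizer.lr.LRScheduler` contract: a subclass overrides `get_lr()`, and each `step()` advances `last_epoch` and re-evaluates the learning rate, which is what the `program.lr_sheduler.step()` call in the training loop further down relies on. As a point of reference only, a minimal standalone subclass (hypothetical name and growth rate, not part of this patch) could look like:

from paddle.optimizer.lr import LRScheduler


class LinearGrowth(LRScheduler):
    # Hypothetical scheduler: lr grows linearly with the epoch counter.
    def __init__(self, learning_rate=1e-5, growth=1e-4, last_epoch=-1,
                 verbose=False):
        self.growth = growth  # set before super() in case get_lr runs during init
        super(LinearGrowth, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr + self.growth * max(self.last_epoch, 0)


scheduler = LinearGrowth()
for _ in range(3):
    scheduler.step()           # advances last_epoch and refreshes the lr
    print(scheduler.last_lr)   # current lr after the step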
-from __future__ import print_function - import numpy as np import unittest -import sys import paddle -import paddle.fluid as fluid import paddle.static from paddle.optimizer.lr import LRScheduler - -paddle.enable_static() -SEED = 2021 +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest class LR_New(LRScheduler): - def __init__(self, learning_rate=1.0, last_epoch=-1, verbose=False): + def __init__(self, learning_rate=1e-5, last_epoch=-1, verbose=False): super(LR_New, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): - self.base_lr = self.base_lr + 1 + self.base_lr = self.base_lr + 1e-4 self.last_epoch = self.last_epoch + 1 return self.base_lr @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") -class TestConvNet(unittest.TestCase): - def _test(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[1, 3, 10, 10], dtype='float32') - conv1 = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - loss = paddle.mean(conv1) - - sgd = paddle.optimizer.SGD(learning_rate=LR_New()) - sgd.minimize(loss) - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = [] - for epoch in range(100): - if hasattr(program, "lr_sheduler"): - program.lr_sheduler.step() - loss_res = exe.run(program, - feed={image.name: np_image}, - fetch_list=[loss]) - result.append(loss_res) - - return np.array(result) +class TestConvNet(IPUOpTest): + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + + opt = paddle.optimizer.Lamb(learning_rate=LR_New()) + opt.minimize(loss) + self.feed_list = [image.name] + self.fetch_list = [loss.name] + + def run_model(self, run_ipu=True): + self.build_model() + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + if run_ipu: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + result = [] + for _ in range(100): + if hasattr(program, "lr_sheduler"): + program.lr_sheduler.step() + loss_res = exe.run(program, + feed=self.feed, + fetch_list=self.fetch_list) + result.append(loss_res) + return np.array(result) def test_training(self): + data = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed = {'image': data} # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) - ipu_loss = self._test(True).flatten() - cpu_loss 
= self._test(False).flatten() + ipu_loss = self.run_model(True).flatten() + cpu_loss = self.run_model(False).flatten() - self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-10)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index 6929ded6ebf90..e7e4c000e16a2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[20, 30]) y = np.random.uniform(size=[30, 20]) @@ -52,63 +48,25 @@ def set_op_attrs(self): "alpha": 1.0, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.matmul(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + + out = paddle.fluid.layers.matmul(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py index ddb06400540e3..0a273e91dd571 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py @@ -26,7 +26,7 @@ def set_serialize_factor(serialize_factor): op._set_attr('serialize_factor', serialize_factor) -@unittest.skipIf(not paddle.is_compiled_with_ipu() or IPUOpTest.use_ipumodel(), +@unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -38,8 +38,8 @@ def setUp(self): def set_data_feed(self): self.feed = { - "x": np.random.uniform(size=[2048, 3072]).astype('float32'), - "y": np.random.uniform(size=[3072, 2048]).astype('float32'), + "x": np.random.uniform(size=[16, 32]).astype('float32'), + "y": np.random.uniform(size=[32, 16]).astype('float32'), } def set_feed_attr(self): @@ -50,58 +50,47 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"transpose_x": False, "transpose_y": False} - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) - - # decrator maybe the best choice, but need to modify api - out = paddle.matmul(x, y, **self.attrs) - set_serialize_factor(4) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + # decrator maybe the best choice, but need to modify api + out = paddle.matmul(x, y, **self.attrs) + set_serialize_factor(4) + self.fetch_list = [out.name] + + def run_model(self, run_ipu): + self.build_model() + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( + self.main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, self.fetch_list) + else: + program = self.main_prog + result = exe.run(program, feed=self.feed, fetch_list=self.fetch_list) + return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - + res0 = self.run_model(False) + res1 = self.run_model(True) self.assertTrue( np.allclose( res0.flatten(), res1.flatten(), atol=self.atol)) - self.assertTrue(res0.shape == res1.shape) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py index 9f1c115403adf..725f3243e0f3d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[2, 3]) y = np.random.uniform(size=[3, 2]) @@ -48,63 +44,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"transpose_x": False, "transpose_y": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.matmul(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.matmul(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index b9dd7358b7955..c0d7dd1fd171d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import 
IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -46,59 +42,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.mean(x) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.mean(x) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py index a70550c1df702..9bdf233556012 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py @@ -18,7 +18,7 @@ import paddle import paddle.static import paddle.nn.functional as F -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionModeFull +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -28,10 +28,7 @@ def setUp(self): self.set_atol() self.set_data_feed() self.set_feed_attr() - - @property - def fp16_enabled(self): - return True + self.set_attrs() def set_atol(self): self.atol = 1e-6 @@ -42,7 +39,6 @@ def set_atol(self): def set_data_feed(self): data = np.random.uniform(size=[1, 10, 27, 27]) self.feed_fp32 = {"in_0": data.astype(np.float32)} - self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] @@ -54,86 +50,126 @@ def 
dtype_check(self, program, to_fp16_var_names): for var_name in to_fp16_var_names: assert (block.var(var_name).dtype, paddle.float16) - def _test_base(self, exec_mode): - generator = paddle.fluid.unique_name.UniqueNameGenerator() - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.fluid.unique_name.guard(generator): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - # using fp32 - x = paddle.static.nn.conv2d( - input=x, num_filters=3, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp16 - with paddle.static.amp.fp16_guard(): - x = paddle.static.nn.conv2d( - input=x, num_filters=6, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp32 - x = paddle.static.nn.fc(x, size=10) - loss = paddle.mean(x) - fetch_list = [loss.name] - - if exec_mode == ExecutionModeFull.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - # cast model to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( - main_prog, self.amp_list) - self.dtype_check(main_prog, to_fp16_var_names) - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - # cast parameters to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - paddle.static.amp.cast_parameters_to_fp16( - paddle.CPUPlace(), - main_prog, - to_fp16_var_names=to_fp16_var_names) - - if exec_mode != ExecutionModeFull.CPU_FP32: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=False) - if exec_mode == ExecutionModeFull.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def set_attrs(self): + self.num_ipus = 1 + self.enable_pipelining = False + self.enable_manual_shard = False + self.batches_per_step = 1 + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d(input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + self.fetch_list = [loss.name] + + def run_model(self, exec_mode): + # cast model to fp16 + if self.is_fp16_mode(exec_mode): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + self.main_prog, amp_list, use_fp16_guard=True) + self.dtype_check(self.main_prog, to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = 
paddle.static.Executor(place) + exe.run(self.startup_prog) + + # cast parameters to fp16 + if exec_mode == IPUOpTest.ExecutionMode.IPU_FP16: + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + self.main_prog, + to_fp16_var_names=to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + is_training=False, + num_ipus=self.num_ipus, + enable_manual_shard=self.enable_manual_shard) + ipu_strategy.set_pipelining_config( + enable_pipelining=self.enable_pipelining, + batches_per_step=self.batches_per_step) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + result = exe.run(program, + feed=self.feed_fp32, + fetch_list=self.fetch_list) + self.output_dict[exec_mode] = result[0] + + def test(self): + for m in IPUOpTest.ExecutionMode: + self.build_model() + self.run_model(m) + self.check() + + +class TestPipline(TestBase): + @IPUOpTest.static_graph + def build_model(self, exec_mode): + feed_shape = list(self.feed_shape[0]) + if self.is_ipu_mode(exec_mode): + feed_shape[0] = 1 + x = paddle.static.data( + name=self.feed_list[0], shape=feed_shape, dtype='float32') + with paddle.static.ipu_shard_guard(index=0, stage=0): + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=1, stage=1): + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d( + input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=2, stage=2): + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + self.fetch_list = [loss.name] + + def set_data_feed(self): + data = np.random.uniform(size=[3, 10, 27, 27]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + + def set_attrs(self): + self.num_ipus = 3 + self.enable_pipelining = True + self.enable_manual_shard = True + self.batches_per_step = 3 def test(self): - output_dict = {} - for mode in ExecutionModeFull: - if mode == ExecutionModeFull.IPU_POPART_FP16: - continue - if mode > ExecutionModeFull.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + self.build_model(m) + self.run_model(m) + # skip check results if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py index 224c0bddc22f9..c4ac9cddd7c3f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py @@ -18,7 +18,7 @@ import paddle import paddle.static import paddle.nn.functional as F -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionModeFull +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,10 +29,7 @@ def setUp(self): self.set_training() self.set_data_feed() self.set_feed_attr() - - @property - def fp16_enabled(self): - return True + self.set_attrs() def set_atol(self): self.atol = 2e-6 @@ -47,104 +44,149 @@ def 
set_training(self): def set_data_feed(self): data = np.random.uniform(size=[1, 3, 28, 28]) self.feed_fp32 = {"in_0": data.astype(np.float32)} - self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) + def set_attrs(self): + self.num_ipus = 1 + self.enable_pipelining = False + self.enable_manual_shard = False + self.batches_per_step = 1 + def dtype_check(self, program, to_fp16_var_names): block = program.global_block() assert len(to_fp16_var_names) > 0 for var_name in to_fp16_var_names: assert (block.var(var_name).dtype, paddle.float16) - def _test_base(self, exec_mode): - generator = paddle.fluid.unique_name.UniqueNameGenerator() - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.fluid.unique_name.guard(generator): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - # using fp32 - x = paddle.static.nn.conv2d( - input=x, num_filters=3, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp16 - with paddle.static.amp.fp16_guard(): - x = paddle.static.nn.conv2d( - input=x, num_filters=6, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp32 - x = paddle.static.nn.fc(x, size=10) - loss = paddle.mean(x) - - # optimizer - optimizer = paddle.optimizer.Adam(learning_rate=1e-2) - optimizer.minimize(loss, startup_prog) - fetch_list = [loss.name] - - # cast model to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( - main_prog, self.amp_list) - self.dtype_check(main_prog, to_fp16_var_names) - - if exec_mode == ExecutionModeFull.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - # cast parameters to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - paddle.static.amp.cast_parameters_to_fp16( - paddle.CPUPlace(), - main_prog, - to_fp16_var_names=to_fp16_var_names) - - if exec_mode != ExecutionModeFull.CPU_FP32: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionModeFull.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - result = [] - for i in range(self.epoch): - out = exe.run(program, feed=feed, fetch_list=fetch_list) - result.append(out) - return np.array(result) - - def test_base(self): - output_dict = {} - for mode in ExecutionModeFull: - if mode == ExecutionModeFull.IPU_POPART_FP16: - continue - if mode > ExecutionModeFull.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = 
paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d(input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + optimizer.minimize(loss, self.startup_prog) + self.fetch_list = [loss.name] + + def run_model(self, exec_mode): + # cast model to fp16 + if self.is_fp16_mode(exec_mode): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + self.main_prog, amp_list) + self.dtype_check(self.main_prog, to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + + # cast parameters to fp16 + if self.is_fp16_mode(exec_mode): + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + self.main_prog, + to_fp16_var_names=to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + is_training=self.is_training, + num_ipus=self.num_ipus, + enable_manual_shard=self.enable_manual_shard) + ipu_strategy.set_pipelining_config( + enable_pipelining=self.enable_pipelining, + batches_per_step=self.batches_per_step) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + result = [] + for _ in range(self.epoch): + out = exe.run(program, + feed=self.feed_fp32, + fetch_list=self.fetch_list) + result.append(out) + self.output_dict[exec_mode] = result + + def test(self): + for m in IPUOpTest.ExecutionMode: + self.build_model() + self.run_model(m) + self.check() + + +class TestPipline(TestBase): + @IPUOpTest.static_graph + def build_model(self, exec_mode): + feed_shape = list(self.feed_shape[0]) + if self.is_ipu_mode(exec_mode): + feed_shape[0] = 1 + x = paddle.static.data( + name=self.feed_list[0], shape=feed_shape, dtype='float32') + + with paddle.static.ipu_shard_guard(index=0, stage=0): + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=1, stage=1): + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d( + input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=2, stage=2): + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + optimizer.minimize(loss, self.startup_prog) + self.fetch_list = [loss.name] + + def set_data_feed(self): + data = np.random.uniform(size=[5, 10, 27, 27]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + + def set_attrs(self): + self.num_ipus = 3 + self.enable_pipelining = True + self.enable_manual_shard = True + self.batches_per_step = 5 + + def test(self): + for m in IPUOpTest.ExecutionMode: + self.build_model(m) + self.run_model(m) + # skip check results if __name__ == "__main__": diff --git 
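# [Editor's note] The TestPipline variants added above shard the network across
# multiple IPUs. A condensed, illustrative sketch of the wiring they rely on;
# every call below (ipu_shard_guard, IpuStrategy.set_graph_config,
# set_pipelining_config, IpuCompiledProgram) appears in the hunks above, the
# concrete values are placeholders, and compiling requires a Paddle build with
# IPU support.
import paddle
import paddle.static

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # The per-step static shape keeps batch 1; with pipelining the host feed
    # carries batches_per_step samples per Executor.run call (as in the tests
    # above, where the feed batch equals batches_per_step).
    x = paddle.static.data(name='in_0', shape=[1, 10, 27, 27], dtype='float32')
    with paddle.static.ipu_shard_guard(index=0, stage=0):
        x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3)
    with paddle.static.ipu_shard_guard(index=1, stage=1):
        loss = paddle.mean(x)

ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(
    is_training=False, num_ipus=2, enable_manual_shard=True)  # one IPU per stage
ipu_strategy.set_pipelining_config(
    enable_pipelining=True, batches_per_step=3)
program = paddle.static.IpuCompiledProgram(
    main_prog, ipu_strategy=ipu_strategy).compile(['in_0'], [loss.name])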
a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 7a9135626df79..583a8941ac62b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[2, 5]) y = np.random.uniform(size=[5, 3]) @@ -51,63 +47,24 @@ def set_op_attrs(self): "y_num_col_dims": 1, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.mul(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.mul(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py new file mode 100644 index 0000000000000..a4365c021ff3c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py @@ -0,0 +1,130 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.zeros([1, 10]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.not_equal(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.ones([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase2(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.arange(0, 10).reshape([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestScalar(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.ones([1, 10]) + y = 0.5 + self.feed_fp32 = {"x": x.astype(np.float32), } + self.feed_fp16 = {"x": x.astype(np.float16), } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = (x != 0.5) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py index 33a5dc888c245..938654bfafc05 100644 --- 
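# [Editor's note] In the TestScalar case of the new not_equal test above,
# `x != 0.5` relies on Paddle's operator overloading for static-graph Variables:
# the comparison is lowered to an elementwise not_equal against the scalar, and
# the boolean result is fetched by name like any other op output. A minimal
# CPU-only sketch of the same idiom (illustrative, not part of the patch):
import numpy as np
import paddle
import paddle.static

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[1, 10], dtype='float32')
    out = (x != 0.5)  # elementwise comparison, bool output

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
res, = exe.run(main_prog,
               feed={'x': np.ones([1, 10], dtype=np.float32)},
               fetch_list=[out.name])
print(res)  # all True, since 1.0 != 0.5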
a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,74 +30,34 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.array([[1], [1], [3], [0]]) - - self.feed = {'x': data1.astype(np.int32)} + self.feed_fp32 = {'x': data1.astype(np.int32)} + self.feed_fp16 = {'x': data1.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) def set_op_attrs(self): self.attrs = {"depth": 4, "allow_out_of_range": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int32') - - out = paddle.fluid.layers.one_hot(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled): - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int32') + out = paddle.fluid.layers.one_hot(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() @unittest.skip('does not support allow_out_of_range=True') diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py index 79fc9b04e1674..ec25f378866aa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest 
@unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,74 +30,34 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.array([[1], [1], [3], [0]]) - - self.feed = {'x': data1.astype(np.int32)} + self.feed_fp32 = {'x': data1.astype(np.int32)} + self.feed_fp16 = {'x': data1.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) def set_op_attrs(self): self.attrs = {"depth": 4, "allow_out_of_range": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int32') - - out = paddle.fluid.input.one_hot(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled): - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int32') + out = paddle.fluid.input.one_hot(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() @unittest.skip('does not support allow_out_of_range=True') diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py index 43f54b52b5c55..060a69e83112a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
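# [Editor's note] In the one_hot hunks above, feed_fp16 keeps the int32 indices
# for both precision modes: only floating-point inputs are cast for the fp16
# execution mode, while integer index tensors are fed unchanged. For reference,
# the expected result of one_hot with depth=4 on the test's input
# [[1], [1], [3], [0]] can be reproduced with plain NumPy:
import numpy as np

indices = np.array([1, 1, 3, 0])
expected = np.eye(4, dtype=np.float32)[indices]
# [[0., 1., 0., 0.],
#  [0., 1., 0., 0.],
#  [0., 0., 0., 1.],
#  [1., 0., 0., 0.]]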
-from __future__ import print_function - import numpy as np import unittest import paddle diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index 4288b82832ede..e5df11eb4ef8c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -56,59 +52,22 @@ def set_op_attrs(self): "data_format": 'NCHW', } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.pool2d(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -180,5 +139,21 @@ def set_attrs(self): self.attrs['exclusive'] = False +class TestAdaptive(TestBase): + def set_op_attrs(self): + self.attrs = { + "pool_size": 1, + "pool_type": 'avg', + "require_index": False + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index 911a163b8aa9c..41b2b8406dc7e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -56,59 +52,22 @@ def set_op_attrs(self): "data_format": 'NCHW', } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.pool2d(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -179,5 +138,21 @@ def set_op_attrs(self): self.attrs['exclusive'] = False +class TestAdaptive(TestBase): + def set_op_attrs(self): + self.attrs = { + "pool_size": 1, + "pool_type": 'max', + "require_index": False + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 
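# [Editor's note] The TestAdaptive classes added to both pooling tests above use
# adaptive pooling with pool_size=1, which fixes the output spatial size at 1x1
# regardless of the input, i.e. global average (or max) pooling. A minimal
# CPU-only sketch of the same legacy-fluid call used by the tests:
import numpy as np
import paddle
import paddle.static

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='in_0', shape=[1, 3, 10, 10], dtype='float32')
    out = paddle.fluid.layers.adaptive_pool2d(x, pool_size=1, pool_type='avg')

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
res, = exe.run(main_prog,
               feed={'in_0': np.random.rand(1, 3, 10, 10).astype(np.float32)},
               fetch_list=[out.name])
print(res.shape)  # (1, 3, 1, 1)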
b3562d722c4e6..5ff1223961bb7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 2, 2]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"factor": 2.0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.pow(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.pow(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -119,54 +78,14 @@ def set_data_feed(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - factor = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = 
paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + factor = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py index c9454e5945f7d..3189e060d5837 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -30,82 +30,48 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() + @property + def fp16_enabled(self): + return False + def set_data_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), - } + data = np.random.uniform(size=[1, 3, 3, 3]).astype('float32') + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = paddle.fluid.layers.conv2d( - x, num_filters=3, filter_size=3) - out = paddle.fluid.layers.Print(out, **self.attrs) - - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=self.feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) 
- else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + out = paddle.fluid.layers.conv2d(x, num_filters=3, filter_size=3) + out = paddle.fluid.layers.Print(out, **self.attrs) + + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index 929ee51b65094..93f96e08fd4b7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -28,10 +28,6 @@ def setUp(self): self.set_training() self.set_test_op() - @property - def fp16_enabled(self): - return True - def set_test_op(self): self.op = paddle.fluid.layers.reduce_mean @@ -40,59 +36,22 @@ def set_feed_attr(self): self.feed_list = list(self.feed_fp32.keys()) self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = self.op(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = self.op(x, **self.attrs) + self.fetch_list = [out.name] - def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - 
output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def run_test_base(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def set_data_feed0(self): data = np.random.uniform(size=[2, 4]) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index 9ddf5c7537fdc..35be4d988273a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -50,60 +46,23 @@ def set_op_attrs(self): "inplace": True, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - add = paddle.fluid.layers.elementwise_add(x, x) - out = paddle.fluid.layers.reshape(add, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + add = paddle.fluid.layers.elementwise_add(x, x) + out = paddle.fluid.layers.reshape(add, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 119771931701c..427e975402344 
100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 4, 6]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -48,59 +44,22 @@ def set_op_attrs(self): self.attrs['shape'] = [6, 8] self.attrs['inplace'] = False - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.reshape(x=x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.reshape(x=x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index ba6eb4d38bcf2..c8f0961baa480 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -14,9 +14,11 @@ import tempfile import unittest +from functools import partial import numpy as np import paddle +import paddle.optimizer import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @@ -28,7 +30,8 @@ def setUp(self): self.set_atol() self.set_data_feed() self.set_feed_attr() - self.set_op_attrs() + self.set_attrs() + self.set_optimizer() def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) @@ -39,15 +42,16 @@ def 
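# [Editor's note] The test_save_load_ipu.py hunks starting above (and continuing
# below) drop the string-based 'opt_type' dispatch in favor of a
# functools.partial factory: each subclass overrides only set_optimizer(), and
# the shared training path calls self.optimizer().minimize(loss). A condensed,
# illustrative sketch of the idiom (class names here are hypothetical):
from functools import partial

import paddle


class OptimizerBase:
    def set_optimizer(self):
        # Bind the hyper-parameters now; instantiate later with self.optimizer().
        self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1)


class OptimizerAdam(OptimizerBase):
    def set_optimizer(self):
        self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1)

# Inside the model-building code the test then does:
#     self.optimizer().minimize(loss)
# which constructs the bound optimizer and applies it to the loss.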
set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def set_op_attrs(self): + def set_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'sgd' self.attrs['enable_fp16'] = False self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1) + def _test_base(self, save_otherwise_load): scope = paddle.static.Scope() main_prog = paddle.static.Program() @@ -71,16 +75,8 @@ def _test_base(self, save_otherwise_load): name='conv2d') loss = paddle.mean(conv1) - if self.attrs['is_training']: - if self.attrs['opt_type'] == 'sgd': - sgd = paddle.optimizer.SGD(learning_rate=1e-2) - sgd.minimize(loss) - elif self.attrs['opt_type'] == 'adam': - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - elif self.attrs['opt_type'] == 'lamb': - lamb = paddle.optimizer.Lamb(learning_rate=1e-2) - lamb.minimize(loss) + # apply optimizer + self.optimizer().minimize(loss) fetch_list = [loss.name] place = paddle.IPUPlace() @@ -91,8 +87,7 @@ def _test_base(self, save_otherwise_load): paddle.static.load(main_prog, self.attrs['model_path'].name) ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config( - is_training=self.attrs['is_training']) + ipu_strategy.set_graph_config(is_training=True) ipu_strategy.set_precision_config( enable_fp16=self.attrs['enable_fp16']) ipu_program = paddle.static.IpuCompiledProgram( @@ -131,62 +126,109 @@ def test_base(self): self.attrs['model_path'].cleanup() +class TestMomentum(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Momentum, learning_rate=1e-1) + + class TestAdam(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'adam' - self.attrs['enable_fp16'] = False - self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1) class TestLamb(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'lamb' - self.attrs['enable_fp16'] = False - self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Lamb, learning_rate=1e-1) + + +class TestAdamW(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.AdamW, learning_rate=1e-1) + + +class TestAdamax(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adamax, learning_rate=1e-1) + + +class TestAdagrad(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1) + + +class TestAdadelta(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adadelta, learning_rate=1e-1) + + +class TestRMSProp(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.RMSProp, learning_rate=1e-1) + + +class TestCenteredRMSProp(TestBase): + def set_optimizer(self): + self.optimizer = partial( + paddle.optimizer.RMSProp, learning_rate=1e-1, centered=True) @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") class TestSGDFP16(TestBase): - def set_op_attrs(self): + def
set_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'sgd' self.attrs['enable_fp16'] = True self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1) -@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") -class TestAdamFP16(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'adam' - self.attrs['enable_fp16'] = True - self.attrs['model_path'] = tempfile.TemporaryDirectory() +class TestMomentumFp16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Momentum, learning_rate=1e-1) -@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") -class TestLambFP16(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'lamb' - self.attrs['enable_fp16'] = True - self.attrs['model_path'] = tempfile.TemporaryDirectory() + +class TestAdamFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1) + + +class TestLambFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Lamb, learning_rate=1e-1) + + +class TestAdamWFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.AdamW, learning_rate=1e-1) + + +class TestAdamaxFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adamax, learning_rate=1e-1) + + +class TestAdagradFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1) + + +class TestAdadeltaFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adadelta, learning_rate=1e-1) + + +class TestRMSPropFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.RMSProp, learning_rate=1e-1) + + +class TestCenteredRMSPropFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial( + paddle.optimizer.RMSProp, learning_rate=1e-1, centered=True) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 49714eba8d4d1..f28bcba4cf0d9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -51,59 +51,22 @@ def set_op_attrs(self): "bias_after_scale": True, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.scale(x, **self.attrs) - - fetch_list = [out.name] - - if
exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.scale(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -155,54 +118,14 @@ def set_op_attrs(self): "bias_after_scale": True, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.scale(x, scale=y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.scale(x, scale=y, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py new file mode 100644 index 0000000000000..113b316af4ea9 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_training(self): + self.is_training = True + self.epoch = 100 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]).astype('float32') + self.feed_fp32 = {"image": data.astype(np.float32)} + self.feed_fp16 = {"image": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.0, + "scaled_optimizer_state": True + } + + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + + weight_decay = self.attrs['weight_decay'] + opt = paddle.optimizer.Adam( + learning_rate=1e-1, weight_decay=weight_decay) + if self.attrs['optimizer'] == 'lamb': + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, lamb_weight_decay=weight_decay) + opt.minimize(loss) + self.feed_list = [image.name] + self.fetch_list = [loss.name] + + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if self.is_ipu_mode(exec_mode): + if "use_no_bias_optimizer" in self.attrs.keys(): + ipu_strategy.set_options({ + "use_no_bias_optimizer": self.attrs["use_no_bias_optimizer"] + }) + if "scaled_optimizer_state" in self.attrs.keys(): + ipu_strategy.set_options({ + "scaled_optimizer_state": + self.attrs["scaled_optimizer_state"] + }) + self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestScaledAdam(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "scaled_optimizer_state": True + } + + def set_atol(self): + super().set_atol() + self.atol = 1e-5 + self.rtol = 1e-5 + + +@unittest.skip('cpu do not support AdamNoBias') +class TestScaledAdamNoBias(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "use_no_bias_optimizer": True, + "scaled_optimizer_state": True + } + + +@unittest.skip('cpu do not support LambNoBias') +class 
TestScaledLambNoBias(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.0, + "use_no_bias_optimizer": True, + "scaled_optimizer_state": True + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 6702ae4344e91..5c61012cacece 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 3e-6 self.rtol = 1e-5 @@ -52,67 +48,32 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - conv1, num_filters=3, filter_size=3, bias_attr=False) - conv3 = paddle.static.nn.conv2d( - conv2, num_filters=3, filter_size=3, bias_attr=False) - conv4 = paddle.static.nn.conv2d( - conv3, num_filters=3, filter_size=3, bias_attr=False) - - fetch_list = [conv4.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config( - is_training=self.is_training, micro_batch_size=2) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + conv2 = paddle.static.nn.conv2d( + conv1, num_filters=3, filter_size=3, bias_attr=False) + conv3 = paddle.static.nn.conv2d( + conv2, num_filters=3, filter_size=3, bias_attr=False) + conv4 = paddle.static.nn.conv2d( + conv3, num_filters=3, filter_size=3, bias_attr=False) + self.fetch_list = [conv4.name] + + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + is_training=self.is_training, micro_batch_size=2) + self.run_op_test(exec_mode, ipu_strategy) def test(self): - output_dict = {} - for mode in ExecutionMode: - if 
mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 8881f018de3b5..ac8ef3e9d65ad 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[4, 5, 6]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -51,59 +47,22 @@ def set_op_attrs(self): "ends": [3, 2, 4], } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.slice(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.slice(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -135,54 +94,17 @@ def set_data_feed(self): def set_op_attrs(self): self.attrs = {"axes": [0, 1, 2]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with fluid.scope_guard(scope): - with 
paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - starts = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - ends = paddle.static.data( - name=self.feed_list[2], - shape=self.feed_shape[2], - dtype='int32') - out = paddle.fluid.layers.slice( - x, starts=starts, ends=ends, **self.attrs) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - pass + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + starts = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + ends = paddle.static.data( + name=self.feed_list[2], shape=self.feed_shape[2], dtype='int32') + out = paddle.fluid.layers.slice( + x, starts=starts, ends=ends, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index 25201959cecbc..0b2d776cf240b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 2, 20]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.softmax(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = 
main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.softmax(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py new file mode 100644 index 0000000000000..cb1ed6ad93044 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 7]) + label = np.arange(3).reshape([3, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {'soft_label': False, } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + if on_ipu: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + else: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int64') + out = F.softmax_with_cross_entropy(x, label, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['label'] = self.feed_fp32['label'].astype(np.int32) + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'ignore_index': 1, + } + + +class TestCase2(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[30, 70]) + label = np.arange(30).reshape([30, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py index 59af3a3d6ac17..63d9584dae37d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,13 +30,8 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = {'x': data1.astype(np.float32)} self.feed_fp16 = {'x': data1.astype(np.float16)} @@ -47,61 +42,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"num_or_sections": [1, 1, 1], "axis": 1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = 
paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.split(x, **self.attrs) - - fetch_list = [fetch.name for fetch in out] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled - ) or mode == ExecutionMode.IPU_POPART_FP16: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.split(x, **self.attrs) + self.fetch_list = [fetch.name for fetch in out] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + for k, v in self.output_dict.items(): + self.output_dict[k] = np.concatenate([vv.flatten() for vv in v]) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index bdc8fb32c8472..33950221ad5e8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 1, 5]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axes": [0]} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.squeeze(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.squeeze(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index c807ab9aab65e..11a827cee0948 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[1, 2]) y = np.random.uniform(size=[1, 2]) @@ -57,67 +53,26 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - z = paddle.static.data( - name=self.feed_list[2], - shape=self.feed_shape[2], - dtype='float32') - - out = paddle.fluid.layers.stack([x, y, z], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, 
fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + z = paddle.static.data( + name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32') + out = paddle.fluid.layers.stack([x, y, z], **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 12351cb63d6c8..fdc6ce08b6e15 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[1, 3, 2, 2]) y = np.random.uniform(size=[1, 3, 2, 2]) @@ -48,134 +44,52 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.sum([x, y], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.sum([x, y], **self.attrs) + self.fetch_list = [out.name] - def 
test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict, check_shape=True) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() -@unittest.skip('') class TestCase1(TestBase): - def set_feed(self): + def set_data_feed(self): x = np.random.uniform(size=[1, 3, 2, 2]) y = np.random.uniform(size=[1, 3, 2, 2]) z = np.random.uniform(size=[1, 3, 2, 2]) self.feed_fp32 = { "x": x.astype(np.float32), "y": y.astype(np.float32), - "z": y.astype(np.float32) + "z": z.astype(np.float32) } self.feed_fp16 = { "x": x.astype(np.float16), "y": y.astype(np.float16), - "z": y.astype(np.float16) + "z": z.astype(np.float16) } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - z = paddle.static.data( - name=self.feed_list[2], - shape=self.feed_shape[2], - dtype='float32') - - out = paddle.fluid.layers.sum([x, y, z], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - iipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + z = paddle.static.data( + name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32') + out = paddle.fluid.layers.sum([x, y, z], **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index ef75aee78049b..c5331d43f5e55 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +31,6 @@ def setUp(self): self.set_test_op() self.set_op_attrs() - @property - def fp16_enabled(self): - 
return True - def set_test_op(self): self.op = paddle.fluid.layers.topk @@ -53,69 +49,35 @@ def set_op_attrs(self): if not self.use_k_as_const_variable: self.attrs["k"] = 3 - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if not self.use_k_as_const_variable: - topk_values, topk_indices = self.op(x, **self.attrs) - else: - # !important, popart cannot accept non const tensor - K_t = paddle.fluid.layers.fill_constant( - shape=[1], dtype='int32', value=self.k, name="in_2") - topk_values, topk_indices = self.op(x, K_t, **self.attrs) - - fetch_list = [topk_values.name, topk_indices.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result - - def test_base(self): + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + if not self.use_k_as_const_variable: + topk_values, topk_indices = self.op(x, **self.attrs) + else: + # !important, popart cannot accept non const tensor + K_t = paddle.fluid.layers.fill_constant( + shape=[1], dtype='int32', value=self.k, name="in_2") + topk_values, topk_indices = self.op(x, K_t, **self.attrs) + self.fetch_list = [topk_values.name, topk_indices.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + value_dict = {} index_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - value, index = self._test_base(mode) - value_dict[mode] = value - index_dict[mode] = index - - self.check(value_dict) - self.check(index_dict) + for k, v in self.output_dict.items(): + value_dict[k] = v[0] + index_dict[k] = v[1] + self.check(output_dict=value_dict) + self.check(output_dict=index_dict) class TestCase2(TestTopKOp): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 1747bde20b6a6..d5fef73a31b3e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ 
-30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"perm": [0, 2, 3, 1]} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.transpose(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.transpose(x, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict, check_shape=True) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check(check_shape=True) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index e068c2e3b5908..54cbc571ec6ff 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 2, 3]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axes": 0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with 
paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.unsqueeze(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.unsqueeze(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check(check_shape=True) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py index ecf1c61f52e83..30e003917efbd 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -50,72 +50,57 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int64') - - with paddle.static.ipu_shard_guard(index=0, stage=0): - y = paddle.fluid.layers.embedding( - input=x, - size=[768, 768], - dtype='float32', - param_attr=paddle.fluid.ParamAttr( - name='word_embedding'), - is_sparse=False) - - with paddle.static.ipu_shard_guard(index=1, stage=1): - z = paddle.fluid.layers.fc( - input=y, - size=768, - param_attr=paddle.fluid.ParamAttr(name="fc")) - - with paddle.static.ipu_shard_guard(index=0, stage=2): - out = paddle.fluid.layers.matmul( - x=z, - y=main_prog.global_block().var('word_embedding'), - transpose_y=True) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config( - num_ipus=2, - is_training=self.is_training, - enable_manual_shard=True) - 
ipu_strategy.set_pipelining_config( - enable_pipelining=True, batches_per_step=3) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_ipu if run_ipu else self.feed_cpu - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64') + with paddle.static.ipu_shard_guard(index=0, stage=0): + y = paddle.fluid.layers.embedding( + input=x, + size=[768, 768], + dtype='float32', + param_attr=paddle.fluid.ParamAttr(name='word_embedding'), + is_sparse=False) + with paddle.static.ipu_shard_guard(index=1, stage=1): + z = paddle.fluid.layers.fc( + input=y, size=768, param_attr=paddle.fluid.ParamAttr(name="fc")) + with paddle.static.ipu_shard_guard(index=0, stage=2): + out = paddle.fluid.layers.matmul( + x=z, + y=self.main_prog.global_block().var('word_embedding'), + transpose_y=True) + self.feed_list = [x.name] + self.fetch_list = [out.name] + + def run_model(self, run_ipu): + self.build_model() + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + if run_ipu: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2, + is_training=self.is_training, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, batches_per_step=3) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + feed = self.feed_ipu if run_ipu else self.feed_cpu + result = exe.run(program, feed=feed, fetch_list=self.fetch_list) + return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + res0 = self.run_model(False) + res1 = self.run_model(True) self.assertTrue( np.allclose( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py new file mode 100644 index 0000000000000..828e92dc03426 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +from functools import partial +import unittest + +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +def product(input): + result = 1 + + for value in input: + result = result * value + + return result + + +class TestShuffleChannelMKLDNNDetectPass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + input_shape = program_config.inputs['input_data'].shape + first_reshape2_shape = program_config.ops[0].attrs['shape'] + transpose2_axis = program_config.ops[1].attrs['axis'] + second_reshape2_shape = program_config.ops[2].attrs['shape'] + + shape_prod = product(input_shape) + img_h = input_shape[-2] + img_w = input_shape[-1] + + if shape_prod != product(first_reshape2_shape) or shape_prod != product( + second_reshape2_shape): + return False + if len(input_shape) != 4 or len(first_reshape2_shape) != 5 or len( + second_reshape2_shape) != 4: + return False + if transpose2_axis != [0, 2, 1, 3, 4]: + return False + if first_reshape2_shape[-1] != img_w or first_reshape2_shape[ + -2] != img_h: + return False + if second_reshape2_shape[-1] != img_w or second_reshape2_shape[ + -2] != img_h: + return False + + return True + + def sample_program_config(self, draw): + input_shape = draw(st.sampled_from([[128, 32, 32]])) + first_reshape2_shape = draw( + st.sampled_from([[2, 64, 32, 32], [8, 16, 32, 32]])) + transpose2_axis = draw(st.sampled_from([[0, 2, 1, 3, 4], [0, 2, 1, 3]])) + second_reshape2_shape = draw( + st.sampled_from([[128, 32, 32], [128, 31, 32]])) + batch_size = draw(st.integers(min_value=1, max_value=10)) + + input_shape.insert(0, batch_size) + first_reshape2_shape.insert(0, batch_size) + second_reshape2_shape.insert(0, batch_size) + + def generate_input(): + return np.random.random(input_shape).astype(np.float32) + + ops_config = [{ + "op_type": "reshape2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["first_reshape2_output"], + "XShape": ["first_reshape2_xshape"] + }, + "op_attrs": { + 'shape': first_reshape2_shape + }, + }, { + "op_type": "transpose2", + "op_inputs": { + "X": ["first_reshape2_output"] + }, + "op_outputs": { + "Out": ["transpose2_output"], + "XShape": ["transpose2_xshape"] + }, + "op_attrs": { + 'axis': transpose2_axis + }, + }, { + "op_type": "reshape2", + "op_inputs": { + "X": ["transpose2_output"], + }, + "op_outputs": { + "Out": ["output_data"], + "XShape": ["second_reshape2_xshape"] + }, + "op_attrs": { + 'shape': second_reshape2_shape + } + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["output_data"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["shuffle_channel"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["shuffle_channel_mkldnn_detect_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py new file mode 100644 index 0000000000000..f8984f5c6dfa4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py @@ -0,0 +1,273 @@ +# Copyright (c) 2022 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2022 + + +class TestDropoutOp(OpTest): + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64)).astype('uint8') + } + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestDropoutOpInput1d(TestDropoutOp): + # change input shape + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((3, 62)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((3, 62)).astype('uint8') + } + + +class TestDropoutOpInput1d_1(TestDropoutOp): + # the input is 1-D + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((2000)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((2000)).astype('uint8') + } + + +class TestDropoutOp2(TestDropoutOp): + # the dropout_prob is 1.0 + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 1.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': np.zeros((32, 64)).astype('float32'), + 'Mask': np.zeros((32, 64)).astype('uint8') + } + + +class TestDropoutOp3(TestDropoutOp): + # the input dim is 3 + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64, 2)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64, 2)).astype('uint8') + } + + +@skip_check_grad_ci(reason="For inference, check_grad is not required.") +class TestDropoutOpInference(OpTest): + # is_test = True + def setUp(self): + self.op_type = "dropout" + 
self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.35, + 'fix_seed': True, + 'is_test': True, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = {'Out': self.inputs['X']} + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +@skip_check_grad_ci(reason="For inference, check_grad is not required.") +class TestDropoutOpInference2(TestDropoutOpInference): + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = {'X': np.random.random((32, 64, 3)).astype(self.dtype)} + self.attrs = { + 'dropout_prob': 0.75, + 'is_test': True, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = {'Out': self.inputs['X']} + + +class TestDropoutOpWithSeed(TestDropoutOp): + # the seed is a Tensor + def setUp(self): + self.op_type = "dropout" + self.set_mlu() + self.init_dtype() + self.inputs = { + "X": np.random.random((32, 64)).astype(self.dtype), + "Seed": np.asarray( + [125], dtype="int32") + } + self.attrs = { + 'dropout_prob': 0.0, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64)).astype('uint8') + } + + +class TestDropoutOpFp16(TestDropoutOp): + # float16 + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestDropoutAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace(), paddle.device.MLUPlace(0)] + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[40, 40], dtype="float32") + res1 = paddle.nn.functional.dropout( + x=input, p=0., training=False, mode='upscale_in_train') + res2 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=True, mode='upscale_in_train') + res3 = paddle.nn.functional.dropout( + x=input, p=0., axis=0, training=False, mode='upscale_in_train') + res4 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=True, + mode='upscale_in_train') + res5 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=[0, 1], + training=False, + mode='upscale_in_train') + res6 = paddle.nn.functional.dropout( + x=input, p=1., training=True, mode='upscale_in_train') + res7 = paddle.fluid.layers.dropout( + x=input, + dropout_prob=0., + dropout_implementation='upscale_in_train') + res8 = paddle.nn.functional.dropout( + x=input, + p=0., + axis=(0, 1), + training=False, + mode='upscale_in_train') + + in_np = np.random.random([40, 40]).astype("float32") + res_np = in_np + res_np2 = np.zeros_like(in_np) + + exe = fluid.Executor(place) + res_list = [res1, res2, res3, res4, res5, res7, res8] + for res in res_list: + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res]) + self.assertTrue(np.allclose(fetches[0], res_np)) + fetches2 = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res6]) + self.assertTrue(np.allclose(fetches2[0], res_np2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 738ed90b12e65..13c72bedefa8e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,6 +15,7 @@ from __future__ import print_function import os +import sys import unittest import warnings import numpy as np @@ -37,20 +38,22 @@ from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from paddle.fluid.framework import Program, OpProtoHolder, Variable, _current_expected_place -from paddle.fluid.tests.unittests.testsuite import ( +from paddle.fluid import unique_name +from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs + +sys.path.append(os.path.abspath(os.path.dirname(__file__))) +from testsuite import ( create_op, set_input, append_input_output, append_loss_ops, ) -from paddle.fluid import unique_name -from paddle.fluid.tests.unittests.white_list import ( +from white_list import ( op_accuracy_white_list, check_shape_white_list, compile_vs_runtime_white_list, no_check_set_white_list, op_threshold_white_list, no_grad_set_white_list, ) -from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs # For switch new eager mode globally g_is_in_eager = _in_eager_without_dygraph_check() @@ -341,6 +344,10 @@ def is_npu_op_test(): def is_mlu_op_test(): return hasattr(cls, "use_mlu") and cls.use_mlu == True + def is_custom_device_op_test(): + return hasattr( + cls, "use_custom_device") and cls.use_custom_device == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -364,7 +371,8 @@ def is_mlu_op_test(): and not is_mkldnn_op_test() \ and not is_rocm_op_test() \ and not is_npu_op_test() \ - and not is_mlu_op_test(): + and not is_mlu_op_test() \ + and not is_custom_device_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." 
% cls.op_type) diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index 7aa83ad907914..3667633d3b38d 100644 --- a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -122,6 +122,29 @@ def test_create_process_group_nccl(self): print("test allreduce min api ok") + # test allreduce prod + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + prod_result = np.multiply(x, y) + + if pg.rank() == 0: + task = dist.all_reduce( + tensor_x, dist.ReduceOp.PROD, use_calc_stream=False) + task.wait() + assert np.array_equal(tensor_x, prod_result) + else: + task = dist.all_reduce( + tensor_y, dist.ReduceOp.PROD, use_calc_stream=False) + task.wait() + assert np.array_equal(tensor_y, prod_result) + + print("test allreduce prod api ok") + # test broadcast # rank 0 x = np.random.random(self.shape).astype(self.dtype) @@ -332,6 +355,27 @@ def test_create_process_group_nccl(self): print("test reduce min api ok") + # test reduce product + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + prod_result = np.multiply(x, y) + + if pg.rank() == 0: + task = dist.reduce( + tensor_x, 0, dist.ReduceOp.PROD, use_calc_stream=False) + task.wait() + assert np.array_equal(tensor_x, prod_result) + else: + task = dist.reduce( + tensor_y, 0, dist.ReduceOp.PROD, use_calc_stream=False) + task.wait() + + print("test reduce prod api ok") # test Scatter # rank 0 in_shape = list(self.shape) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py new file mode 100644 index 0000000000000..f9c5d4d78c866 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
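The new model-parallel test added below shards the fused multi-transformer weights across two ranks. As background, here is a minimal sketch of the Megatron-style split it performs: the QKV weights are partitioned by attention head (column-parallel) and the output projection by the matching rows (row-parallel), so each rank holds one shard. This sketch is illustrative only; the constants follow the test's definitions (MODEL_PARALLEL_SIZE = 2, num_head = 4, dim_head = 4) and the helper name is not from this PR.

    import numpy as np

    MODEL_PARALLEL_SIZE = 2
    num_head, dim_head = 4, 4
    hidden = num_head * dim_head            # 16

    qkv_w = np.random.rand(3, num_head, dim_head, hidden)
    linear_w = np.random.rand(num_head * dim_head, hidden)

    def shard(rank):
        # Column-parallel split of QKV: each rank keeps a contiguous slice of heads.
        heads_per_rank = num_head // MODEL_PARALLEL_SIZE
        start, end = rank * heads_per_rank, (rank + 1) * heads_per_rank
        col_qkv_w = qkv_w[:, start:end, :, :]
        # Row-parallel split of the output projection over the same head slice.
        row_linear_w = linear_w[start * dim_head:end * dim_head, :]
        return col_qkv_w, row_linear_w

    # Concatenating the two shards recovers the full weights.
    w0, l0 = shard(0)
    w1, l1 = shard(1)
    assert np.allclose(np.concatenate([w0, w1], axis=1), qkv_w)
    assert np.allclose(np.concatenate([l0, l1], axis=0), linear_w)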
+ +from __future__ import print_function + +import numpy as np + +import paddle +import paddle.fluid as fluid +from test_dist_base import TestDistRunnerBase, runtime_main +from paddle.incubate.nn import FusedMultiTransformer +import paddle.distributed.fleet as fleet + +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.fluid.dygraph.layers import Layer +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid import core +from paddle.nn.initializer import Constant + +paddle.enable_static() + + +def get_param_attr(weight, bias): + weight_attr = paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(weight)) + bias_attr = paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(bias)) + return weight_attr, bias_attr + + +DTYPE = "float32" +MODEL_PARALLEL_SIZE = 2 +num_head = 2 * MODEL_PARALLEL_SIZE +dim_head = 4 +hidden = num_head * dim_head +dim_ffn = 4 * hidden + + +def create_model(data, rank): + np.random.seed(2021) + ln_w = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE) + ln_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE) + qkv_w = np.random.uniform( + -1, 1, size=(3, num_head, dim_head, hidden)).astype(DTYPE) + qkv_b = np.random.uniform(-1, 1, size=(3, num_head, dim_head)).astype(DTYPE) + linear_w = np.random.uniform( + -1, 1, size=(num_head * dim_head, hidden)).astype(DTYPE) + linear_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE) + + ffn_ln_w = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE) + ffn_ln_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE) + ffn1_w = np.random.uniform(-1, 1, size=(hidden, dim_ffn)).astype(DTYPE) + ffn1_b = np.random.uniform(-1, 1, size=(dim_ffn, )).astype(DTYPE) + ffn2_w = np.random.uniform(-1, 1, size=(dim_ffn, hidden)).astype(DTYPE) + ffn2_b = np.random.uniform(-1, 1, size=(hidden, )).astype(DTYPE) + + if rank is not None: + start = 0 if rank == 0 else (num_head // MODEL_PARALLEL_SIZE) + end = start + (num_head // MODEL_PARALLEL_SIZE) + col_qkv_w = qkv_w[:, start:end, :, :] + col_qkv_b = qkv_b[:, start:end, :] + row_linear_w = linear_w[(start * dim_head):(end * dim_head), :] + + ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b) + qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b) + linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b) + + start = 0 if rank == 0 else (dim_ffn // MODEL_PARALLEL_SIZE) + end = start + (dim_ffn // MODEL_PARALLEL_SIZE) + col_ffn1_w = ffn1_w[:, start:end] + col_ffn1_b = ffn1_b[start:end] + row_ffn2_w = ffn2_w[start:end, :] + + ffn_ln_w_attr, ffn_ln_b_attr = get_param_attr(ffn_ln_w, ffn_ln_b) + ffn1_w_attr, ffn1_b_attr = get_param_attr(col_ffn1_w, col_ffn1_b) + ffn2_w_attr, ffn2_b_attr = get_param_attr(row_ffn2_w, ffn2_b) + + multi_transformer = FusedMultiTransformer( + hidden, + num_head, + dim_ffn, + dropout_rate=0.0, + activation="gelu", + normalize_before=True, + ln_scale_attrs=[ln_w_attr], + ln_bias_attrs=[ln_b_attr], + qkv_weight_attrs=[qkv_w_attr], + qkv_bias_attrs=[qkv_b_attr], + linear_weight_attrs=[linear_w_attr], + linear_bias_attrs=[linear_b_attr], + ffn_ln_scale_attrs=[ffn_ln_w_attr], + ffn_ln_bias_attrs=[ffn_ln_b_attr], + ffn1_weight_attrs=[ffn1_w_attr], + ffn1_bias_attrs=[ffn1_b_attr], + ffn2_weight_attrs=[ffn2_w_attr], + ffn2_bias_attrs=[ffn2_b_attr], + nranks=MODEL_PARALLEL_SIZE, + ring_id=0) + result = multi_transformer(data) + else: + ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b) + qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b) + linear_w_attr, 
linear_b_attr = get_param_attr(linear_w, linear_b) + + ffn_ln_w_attr, ffn_ln_b_attr = get_param_attr(ffn_ln_w, ffn_ln_b) + ffn1_w_attr, ffn1_b_attr = get_param_attr(ffn1_w, ffn1_b) + ffn2_w_attr, ffn2_b_attr = get_param_attr(ffn2_w, ffn2_b) + + multi_transformer = FusedMultiTransformer( + hidden, + num_head, + dim_ffn, + dropout_rate=0.0, + activation="gelu", + normalize_before=True, + ln_scale_attrs=[ln_w_attr], + ln_bias_attrs=[ln_b_attr], + qkv_weight_attrs=[qkv_w_attr], + qkv_bias_attrs=[qkv_b_attr], + linear_weight_attrs=[linear_w_attr], + linear_bias_attrs=[linear_b_attr], + ffn_ln_scale_attrs=[ffn_ln_w_attr], + ffn_ln_bias_attrs=[ffn_ln_b_attr], + ffn1_weight_attrs=[ffn1_w_attr], + ffn1_bias_attrs=[ffn1_b_attr], + ffn2_weight_attrs=[ffn2_w_attr], + ffn2_bias_attrs=[ffn2_b_attr]) + result = multi_transformer(data) + + # fused_multi_transformer have no backward + result.stop_gradient = True + predict = paddle.mean(result) + return predict + + +class TestModelParallel(TestDistRunnerBase): + def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): + # Input data + seq_len = 2 + data_in = fluid.data( + name='data_in', shape=[batch_size, seq_len, hidden], dtype=DTYPE) + + if dist_strategy: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[data_in], + capacity=64, + use_double_buffer=False, + iterable=False) + + if dist_strategy: + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2} + + rank = fleet.worker_index() if dist_strategy else None + avg_cost = create_model(data_in, rank) + opt = fluid.optimizer.SGD(0.1) + + if dist_strategy: + dist_opt = fleet.distributed_optimizer( + optimizer=opt, strategy=strategy) + dist_opt.minimize(avg_cost) + else: + opt.minimize(avg_cost) + + def gen_data(): + np.random.seed(2021) + while True: + data = [np.random.random([seq_len, hidden]).astype(DTYPE)] + yield data + + train_reader = paddle.batch(gen_data, batch_size=batch_size) + + if dist_strategy: + return None, avg_cost, train_reader, None, None, None, data_loader + else: + return None, avg_cost, train_reader, None, None, None + + +if __name__ == "__main__": + runtime_main(TestModelParallel) diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 72240be41dd49..570551e82646f 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -52,6 +52,9 @@ def test_grad(self): class TestSigmoidDoubleGradCheck(unittest.TestCase): + def sigmoid_wrapper(self, x): + return fluid.layers.sigmoid(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -64,6 +67,8 @@ def func(self, place): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.sigmoid_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -75,6 +80,9 @@ def test_grad(self): class TestTanhTripleGradCheck(unittest.TestCase): + def tanh_wrapper(self, x): + return paddle.tanh(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -87,6 +95,8 @@ def func(self, place): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.triple_grad_check_for_dygraph( + self.tanh_wrapper, 
[x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -98,6 +108,9 @@ def test_grad(self): class TestTanhDoubleGradCheck(unittest.TestCase): + def tanh_wrapper(self, x): + return paddle.tanh(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -110,6 +123,8 @@ def func(self, place): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.tanh_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -146,6 +161,9 @@ def test_grad(self): class TestLeakyReluDoubleGradCheck(unittest.TestCase): + def leaky_relu_wrapper(self, x): + return paddle.nn.functional.leaky_relu(x[0], negative_slope=0.2) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -162,6 +180,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.leaky_relu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -173,6 +193,9 @@ def test_grad(self): class TestELUDoubleGradCheck(unittest.TestCase): + def elu_wrapper(self, x): + return paddle.nn.functional.elu(x[0], alpha=0.2) + @prog_scope() def func(self, place): shape = [2, 4, 4, 4] @@ -189,6 +212,8 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.elu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_assign_pos_op.py b/python/paddle/fluid/tests/unittests/test_assign_pos_op.py index 72924f242d211..46761063b8af2 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_pos_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_pos_op.py @@ -24,6 +24,7 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward from paddle.distributed.models.moe import utils +from paddle.fluid.framework import _test_eager_guard def assign_pos(x, _cum_count): @@ -117,7 +118,7 @@ def test_api_static(self): fetch_list=[out]) assert_allclose(res[0], self.out, self.cum_count) - def test_api_dygraph(self): + def func_api_dygraph(self): paddle.disable_static() x = paddle.to_tensor(self.x) cum_count = paddle.to_tensor(self.cum_count).astype(x.dtype) @@ -125,6 +126,11 @@ def test_api_dygraph(self): out = utils._assign_pos(x, cum_count) assert_allclose(out.numpy(), self.out, self.cum_count) + def test_api_dygraph(self): + with _test_eager_guard(): + self.func_api_dygraph() + self.func_api_dygraph() + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 0470a2df35f68..d9cb0ccf48209 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -235,12 +235,13 @@ def test_extremely_simple_net_with_op_in_condition(self): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) - ret = exe.run(main_program, fetch_list=[out, a.grad_name, b.grad_name]) + ret = exe.run(main_program, + fetch_list=[out, b, a.grad_name, b.grad_name]) # Note: fill_constant has loss of precision, you have to assertEqual # with values doens't lose precision 
in float-point number. - self.assertEqual(ret[0][0], 1.25) - self.assertEqual(ret[1][0], 0.0) - self.assertEqual(ret[2][0], 1.0) + self.assertEqual(ret[0][0], ret[1][0]) + self.assertEqual(ret[2][0], 0.0) + self.assertEqual(ret[3][0], 1.0) class TestCondNestedControlFlow(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 6a9f7a47f66cc..fdb93e1f1afdd 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -172,9 +172,9 @@ def test_check_grad_no_input(self): def create_test_cudnn_bf16_class(parent): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, - "core is not compiled with CUDA and cudnn version need larger than 8.1.0" - ) + not core.is_compiled_with_cuda() or + not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16") class TestConv2DCUDNNBF16(parent): def get_numeric_grad(self, place, check_name): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 6976019210283..6033b809f218d 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -108,7 +108,6 @@ def test_generator_randint_dygraph(self): if core.is_compiled_with_cuda(): print(">>>>>>> randint dygraph >>>>>>>") - self.assertTrue(np.allclose(x1_np, x2_np)) self.assertTrue(np.allclose(x_np, x3_np)) def test_gen_TruncatedNormal_initializer(self): diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py b/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py index a140bb5c79c93..7348783bd6748 100755 --- a/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py @@ -15,12 +15,14 @@ from __future__ import print_function import unittest import numpy as np - +import tempfile +import warnings +import json import paddle import paddle.nn as nn from paddle.io import Dataset, DataLoader, BatchSampler, SequenceSampler -from paddle.fluid.reader import set_autotune_config import sys +import os class RandomDataset(Dataset): @@ -51,12 +53,21 @@ def setUp(self): self.dataset = RandomDataset(10) def test_dataloader_use_autotune(self): - set_autotune_config(True, 1) + paddle.incubate.autotune.set_config( + config={"dataloader": { + "enable": True, + "tuning_steps": 1, + }}) loader = DataLoader( self.dataset, batch_size=self.batch_size, num_workers=0) def test_dataloader_disable_autotune(self): - set_autotune_config(False) + config = {"dataloader": {"enable": False, "tuning_steps": 1}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) loader = DataLoader( self.dataset, batch_size=self.batch_size, num_workers=2) if (sys.platform == 'darwin' or sys.platform == 'win32'): @@ -65,12 +76,28 @@ def test_dataloader_disable_autotune(self): self.assertEqual(loader.num_workers, 2) def test_distributer_batch_sampler_autotune(self): - set_autotune_config(True, 1) + paddle.incubate.autotune.set_config( + config={"dataloader": { + "enable": True, + "tuning_steps": 1, + }}) batch_sampler = paddle.io.DistributedBatchSampler( self.dataset, batch_size=self.batch_size) loader = 
DataLoader( self.dataset, batch_sampler=batch_sampler, num_workers=2) +class TestAutoTuneAPI(unittest.TestCase): + def test_set_config_warnings(self): + with warnings.catch_warnings(record=True) as w: + config = {"kernel": {"enable": 1, "tuning_range": True}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) + self.assertTrue(len(w) == 2) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 6a32a68db1be8..348945b73e1a4 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -481,11 +481,9 @@ def test_in_memory_dataset_run_2(self): dataset._set_fleet_send_sleep_seconds(2) dataset.preload_into_memory() dataset.wait_preload_done() - dataset.release_memory() dataset.preload_into_memory(1) dataset.wait_preload_done() dataset.dataset.merge_by_lineid() - dataset.release_memory() dataset._set_merge_by_lineid(30) dataset._set_parse_ins_id(False) dataset.load_into_memory() diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 99a46bfd9584d..b435975452009 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -19,8 +19,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard class TestDiffOp(unittest.TestCase): @@ -55,7 +54,7 @@ def setUp(self): if core.is_compiled_with_cuda(): self.places.append(paddle.CUDAPlace(0)) - def test_dygraph(self): + def func_dygraph(self): for place in self.places: paddle.disable_static() x = paddle.to_tensor(self.input, place=place) @@ -71,6 +70,13 @@ def test_dygraph(self): append=self.append) self.assertTrue((out.numpy() == self.output).all(), True) + def test_dygraph(self): + with _test_eager_guard(): + self.setUp() + self.func_dygraph() + self.setUp() + self.func_dygraph() + def test_static(self): paddle.enable_static() places = [fluid.CPUPlace()] @@ -110,7 +116,7 @@ def test_static(self): fetch_list=[out]) self.assertTrue((fetches[0] == self.output).all(), True) - def test_grad(self): + def func_grad(self): for place in self.places: x = paddle.to_tensor(self.input, place=place, stop_gradient=False) if self.prepend is not None: @@ -129,6 +135,13 @@ def test_grad(self): except: raise RuntimeError("Check Diff Gradient Failed") + def test_grad(self): + with _test_eager_guard(): + self.setUp() + self.func_grad() + self.setUp() + self.func_grad() + class TestDiffOpAxis(TestDiffOp): def set_args(self): diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py index af99529adfa74..315580dd31ad7 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py @@ -34,7 +34,9 @@ def remove_file_if_exists(file_name): shutil.rmtree(file_name) -def run_test(clip_after_allreduce=True, max_global_norm=-1.0): +def run_test(clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=1): if not 
paddle.is_compiled_with_cuda(): return if os.name == 'nt': @@ -55,6 +57,7 @@ def run_test(clip_after_allreduce=True, max_global_norm=-1.0): os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce) os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) + os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) touch_file_env = 'SUCCESS_TOUCH_FILE' touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid()) diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py new file mode 100644 index 0000000000000..1822b77d0d0e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from test_distributed_fused_lamb_op_with_clip import run_test +import unittest + + +class TestDistributedFusedLambGradientMerge(unittest.TestCase): + def test_gm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 20abeaec7268c..e8d4fc260b87a 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -23,7 +23,6 @@ import paddle.fluid as fluid from paddle.fluid import Program, program_guard from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph -_enable_legacy_dygraph() import os from paddle import _C_ops @@ -979,6 +978,7 @@ def test_backward_downscale_in_infer_eager(self): ), self.cal_grad_downscale_in_infer(mask.numpy()))) def test_backward_upscale_train(self): + _enable_legacy_dygraph() for place in self.places: with fluid.dygraph.guard(place): @@ -1010,6 +1010,7 @@ def test_backward_upscale_train_eager(self): ), self.cal_grad_upscale_train(mask.numpy(), prob))) def test_backward_upscale_train_2(self): + _enable_legacy_dygraph() for place in self.places: with fluid.dygraph.guard(place): @@ -1025,6 +1026,23 @@ def test_backward_upscale_train_2(self): np.allclose(input.gradient( ), self.cal_grad_upscale_train(mask.numpy(), prob))) + def test_backward_upscale_train_2_eager(self): + for place in self.places: + with fluid.dygraph.guard(place): + with _test_eager_guard(): + + prob = 0.3 + input = paddle.uniform([40, 40], dtype="float32") + input.stop_gradient = False + out, mask = _C_ops.final_state_dropout( + input, None, 0.3, False, "upscale_in_train", 0, False) + + out.backward() + + self.assertTrue( + np.allclose(input.gradient( + ), self.cal_grad_upscale_train(mask.numpy(), prob))) + class TestRandomValue(unittest.TestCase): def test_fixed_random_number(self): diff --git a/python/paddle/fluid/tests/unittests/test_eigh_op.py 
b/python/paddle/fluid/tests/unittests/test_eigh_op.py index 9c9cd883313a2..2abbcc98a6b7e 100644 --- a/python/paddle/fluid/tests/unittests/test_eigh_op.py +++ b/python/paddle/fluid/tests/unittests/test_eigh_op.py @@ -19,8 +19,6 @@ import paddle from op_test import OpTest from gradient_checker import grad_check -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() def valid_eigh_result(A, eigh_value, eigh_vector, uplo): diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py new file mode 100644 index 0000000000000..565e43214ea32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +from op_test import OpTest + + +class TestEinsumBinary(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "einsum" + self.disable = False + self.set_mandatory() + self.init_input() + np.random.seed(123) + out = np.einsum(self.equation, *self.inputs) + self.operands = [] + for idx, inp in enumerate(self.inputs): + self.operands.append(("x" + str(idx), inp)) + self.inputs = {"Operands": self.operands} + self.attrs = {"equation": self.equation} + self.outputs = {'Out': out} + + def init_input(self): + self.inputs = [] + for t, s in zip(self.types, self.shapes): + self.inputs.append(np.random.random(s).astype(t)) + + def set_mandatory(self): + self.disable = False + self.shapes = [(10, 10, 20), (20, 6)] + self.types = [np.float64, np.float64] + self.equation = "mij,jk->ki" + + def test_check_output(self): + if not self.disable: + self.check_output() + + def test_grad(self): + if not self.disable: + self.check_grad([op[0] for op in self.operands], ["Out"]) + + +class TestEinsum1(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(20, 3, 3), (20, 3, 3)] + self.types = [np.float64, np.float64] + self.equation = "mij,mjk->mik" + + +class TestEinsum2(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(20, 3, 3), (20, 3, 3)] + self.types = [np.float64, np.float64] + self.equation = "mij,mjk->ikm" + + +class TestEinsum3(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 10), (10, 10)] + self.types = [np.float64, np.float64] + self.equation = "ij,jk->ik" # }}} + + +class TestEinsumWithReduction(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 3, 5), (5, 30)] + self.types = [np.float64, np.float64] + self.equation = "ijk,kl->jl" + + +class TestEinsumWithReduction1(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 3, 3, 5), (10, 5, 10, 10)] + self.types = [np.float64, np.float64] + self.equation = "mijk,mklh->ljm" + + +class TestEinsumWithUnary(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 10, 3, 5)] + self.types = [np.float64] + self.equation = "mijk->mi" + + +class 
TestEinsumWithUnary1(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 10, 3, 3), (3, 6, 3, 10)] + self.types = [np.float64, np.float64] + self.equation = "imjl,jklm->imk" + + +class TestEinsumWithBroadcast1(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 10, 3, 3)] + self.types = [np.float64] + self.equation = "i...->..." + + +class TestEinsumWithBroadcast2(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 11), (3, 4, 5, 10)] + self.types = [np.float64, np.float64] + self.equation = "...ij,...i->j..." + + +class TestEinsumWithBroadcast3(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 3, 2, 3, 4), (12, 10)] + self.types = [np.float64, np.float64] + self.equation = "k...,...jk->...k" + + +class TestEinsumWithBroadcast4(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 3, 2, 3, 4), (12, 10)] + self.types = [np.float64, np.float64] + self.equation = "a...d,...cb->...abcd" + + +class TestEinsumWithBroadcast5(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(3, 2, 2, 10), (10, 3, 2, 2)] + self.types = [np.float64, np.float64] + self.equation = "...a,a...->..." + + +class TestEinsumWithBroadcast6(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(100), (100)] + self.types = [np.float64, np.float64] + self.equation = "i,i->" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py new file mode 100644 index 0000000000000..63acaf6396913 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -0,0 +1,468 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import contextlib +import unittest +import paddle +from paddle.fluid import core + +import os +os.environ['FLAGS_new_einsum'] = "1" + + +def error_trans(func, *args, **kargs): + """ + transport C++ exception into Python exception. + because einsum_v2 raise different exception with einsum_v1. 
+ """ + try: + out = func(*args, **kargs) + except ValueError as e: + if "Same label have different shapes" in str(e): + raise AssertionError("Invalid operands: label i " + "corresponds to non-broadcastable dimensions.") + + +class TestErrors(unittest.TestCase): + def setUp(self): + pass + + def test_diagonalize_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') + a = paddle.to_tensor(a) + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): + paddle.einsum('...ii->...i', a) + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): + paddle.einsum('i...i', a) + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): + paddle.einsum('i...i->i...', a) + + def test_param_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') + a = paddle.to_tensor(a) + with self.assertRaisesRegex( + AssertionError, + ("Required at least one operand in Einsum API, but received 0 ")): + paddle.einsum('ijk') + with self.assertRaisesRegex(AssertionError, ( + 'Invalid equation: multiple `->` were found.')): + paddle.einsum('i -> j -> k', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 2, " + "but found 3 segments in the label equation.")): + paddle.einsum('i,j,k', a, a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 2, " + "but found 1 segments in the label equation.")): + paddle.einsum('ij -> k', a, a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 1, " + "but found 2 segments in the label equation.")): + paddle.einsum('i, -> k', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the label string '' misses dimensions.")): + paddle.einsum('->', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the label string 'i' misses dimensions.")): + paddle.einsum('i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: _ is not a valid label, " + "which should be letters.")): + paddle.einsum('i_', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: `.` is found outside of an ellipsis.")): + paddle.einsum('i..j', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: `.` is found outside of an ellipsis.")): + paddle.einsum('...k...', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: missing ellipsis in output labels.")): + paddle.einsum('i...->i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: duplicate output labels are found.")): + paddle.einsum('i...->i...i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid operands: label i " + "corresponds to non-broadcastable dimensions.")): + error_trans(paddle.einsum, 'ij...,ji...', a, a) + + +class TestEinsum(unittest.TestCase): + @classmethod + def setUpClass(cls): + np.random.seed(12345) + + cls.TEST_SAMPLES = { + "a": np.random.rand(1, 1), + "b": np.random.rand(1), + "x": np.random.rand(5), + "y": np.random.rand(7), + "A": np.random.rand(4, 5), + "B": np.random.rand(2, 5), + "C": np.random.rand(3, 7), + "D": np.random.rand(3, 4, 5), + "E": np.random.rand(3, 5, 2), + "F": np.random.rand(2, 4, 5, 3), + "G": np.random.rand(4, 2, 5), + "H": np.random.rand(3, 2, 4), + "I": np.random.rand(2, 2), + "J": np.random.rand(1, 3, 5), + "K": np.random.rand(1, 2, 3, 4), + } + + def _get_place(self, force_to_use_cpu=False): + if 
force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8): + error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}' + self.assertTrue( + np.allclose( + actual, expect, rtol=rtol, atol=atol), + error_msg.format(paddle.get_device(), expect, actual, + self.__class__.__name__)) + + def setUp(self): + self.sample = {"paradigm": "i->", "data": ["x"]} + + def test_forward(self): + operands = [ + TestEinsum.TEST_SAMPLES[operand] for operand in self.sample["data"] + ] + expected_result = np.einsum(self.sample["paradigm"], *operands) + equation = self.sample["paradigm"] + + with paddle.fluid.dygraph.guard( + self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + with paddle.fluid.dygraph.guard(self._get_place(force_to_use_cpu=True)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + +class TestEinsumVectorDot(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->", "data": ["x", "x"]} + + +class TestEinsumVectorMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]} + + +class TestEinsumVectorOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,j->ij", "data": ["x", "y"]} + + +class TestEinsumMatrixTranspose(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->ji", "data": ["A"]} + + +class TestEinsumMatrixRowSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->j", "data": ["A"]} + + +class TestEinsumMatrixColSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->i", "data": ["A"]} + + +class TestEinsumMatrixEleMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,ij->ij", "data": ["A", "A"]} + + +class TestEinsumDegenerateMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j", "data": ["a", "b"]} + + +class TestEinsumMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j->i", "data": ["A", "x"]} + + +class TestEinsumMatrixMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kj->ik", "data": ["A", "B"]} + + +class TestEinsumMatrixOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kl->ijkl", "data": ["A", "C"]} + + +class TestEinsumTensorBMM(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bij,bjk->bik", "data": ["D", "E"]} + + +class TestEinsumTensorContract1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->i", "data": ["D", "A"]} + + +class TestEinsumTensorContract2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,lk->ijl", "data": ["D", "B"]} + + +class TestEinsumTensorContract3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "abcd,dfg->abcfg", "data": ["F", "D"]} + + +class TestEinsumTensorContract4(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ik", "data": ["D", "A"]} + + +class TestEinsumTensorContract5(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ij", "data": ["D", "A"]} + + +class TestEinsumTensorContract6(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ik, ijk->j", "data": ["A", "G"]} + 
+ +class TestEinsumTensorContract7(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk, ik->jk", "data": ["G", "A"]} + + +class TestEinsumEllipsis1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i...->...", "data": ["G"]} + + +class TestEinsumEllipsis2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,...i->j...", "data": ["A", "H"]} + + +class TestEinsumEllipsis3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "k...,jk", "data": ["F", "I"]} + + +class TestEinsumTestEinsumBilinear(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bn,anm,bm->ba", "data": ["B", "E", "I"]} + + +class TestEinsumTestEinsumOthers1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->kmn", "data": ["F", "H"]} + + +class TestEinsumTestEinsumOthers2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->ijn", "data": ["F", "H"]} + + +class TestEinsumBatch1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "blq,bhlk->bhlqk", "data": ["J", "K"]} + + +class TestNumpyTests(unittest.TestCase): + def setUp(self): + pass + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8): + error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}' + self.assertTrue( + np.allclose( + actual, expect, rtol=rtol, atol=atol), + error_msg.format(paddle.get_device(), expect, actual, + self.__class__.__name__)) + + def check_output(self, eqn, *ops): + expect = np.einsum(eqn, *ops) + with paddle.fluid.dygraph.guard( + self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(op) for op in ops] + actual = paddle.einsum(eqn, *pd_operands) + self.check_output_equal(actual.numpy(), expect) + + def test_sums(self): + for n in range(1, 17): + a = np.arange(n).astype('float') + self.check_output("i->", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("...i->...", a) + + for n in range(1, 17): + a = np.arange(2 * n).reshape(2, n).astype('float') + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(3 * n).reshape(3, n).astype('float') + b = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("..., ...", a, b) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("...i, ...i", a, b) + + for n in range(1, 11): + a = np.arange(n * 3 * 2).reshape(n, 3, 2).astype('float') + b = np.arange(n).astype('float') + self.check_output("i..., i...", a, b) + + for n in range(1, 17): + a = (np.arange(3) + 1).astype('float') + b = (np.arange(n) + 1).astype('float') + self.check_output("i,j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("ij, j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("ji,j", a.T, b.T) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n * 6).reshape(n, 6).astype('float') + self.check_output("ij,jk", a, b) + + a = np.arange(12).reshape(3, 
4).astype('float') + b = np.arange(20).reshape(4, 5).astype('float') + c = np.arange(30).reshape(5, 6).astype('float') + self.check_output("ij,jk,kl", a, b, c) + + a = np.arange(60).reshape(3, 4, 5).astype('float') + b = np.arange(24).reshape(4, 3, 2).astype('float') + self.check_output("ijk, jil -> kl", a, b) + + for n in range(1, 25): + a = np.arange(n).astype('float') + self.check_output("...,...", a, a) + self.check_output("i,i", a, a) + + # TODO(@xiongkun): explict broadcast in EinsumOp is not supported, it's not recommend to use einsum like this. + #p = np.ones((10, 2)).astype('float') + #q = np.ones((1, 2)).astype('float') + #self.check_output('ij,ij->j', p, q) + + # TODO(@xiongkun): explict-label-broadcast in EinsumOp is not supported, it's not recommend to use einsum like this. + #x = np.array([2., 3.]).astype('float') + #y = np.array([4.]).astype('float') + #self.check_output("i, i", x, y) + + # TODO(@xiongkun): explict-label-broadcast in EinsumOp is not supported, it's not recommend to use einsum like this. + #p = np.ones((1, 5)) / 2 + #q = np.ones((5, 5)) / 2 + #self.check_output("...ij,...jk->...ik", p, p) + #self.check_output("...ij,...jk->...ik", p, q) + + x = np.eye(2).astype('float') + y = np.ones(2).astype('float') + self.check_output("ji,i->", x, y) + self.check_output("i,ij->", y, x) + self.check_output("ij,i->", x, y) + + def test_large_nops(self): + pass + # TODO(@xiongkun): explict broadcast in EinsumOp is not supported, it's not recommend to use einsum like this. + #a = np.arange(4 * 3 * 1 * 4).reshape(4, 3, 1, 4).astype('float') + #self.check_output('a...b,b...c,c...d', a, a, a) + #self.check_output('a...b,b...c,c...a', a, a, a) + #self.check_output('a...b,b...c,c...a', a, a, a) + #self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + + def test_static_graph(self): + paddle.enable_static() + fluid = paddle.fluid + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + a = paddle.static.data( + name='a', shape=[3, None, None, None], dtype='float') + b = paddle.static.data( + name='b', shape=[2, None, None, None], dtype='float') + c = paddle.static.data( + name='c', shape=[None, None, 2, None], dtype='float') + d = paddle.static.data( + name='d', shape=[None, None, 5], dtype='float') + e = paddle.static.data( + name='e', shape=[None, 2, None], dtype='float') + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum('...ik, ...j', c, d)) + outs.append(paddle.einsum('...kj, ...ik', d, e)) + outs.append(paddle.einsum('ijk..., ikj', c, e)) + outs.append(paddle.einsum('ijk..., ikj->...ij', c, e)) + exe = fluid.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype('float') + b = np.arange(48).reshape(2, 2, 3, 4).astype('float') + c = np.arange(48).reshape(2, 3, 2, 4).astype('float') + d = np.arange(30).reshape(2, 3, 5).astype('float') + e = np.arange(12).reshape(2, 2, 3).astype('float') + feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum('...ik, ...j', c, d)) + expect.append(np.einsum('...kj, ...ik', d, e)) + expect.append(np.einsum('ijk..., ikj', c, e)) + expect.append(np.einsum('ijk..., ikj->...ij', c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + + +if __name__ == 
"__main__": + u diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index d50241e58dea3..27dbd3752b550 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -60,9 +60,9 @@ def init_dtype(self): pass -@unittest.skipIf( - not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, - "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +@unittest.skipIf(not core.is_compiled_with_cuda() or + not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16") class TestElementwiseDivOpBF16(OpTest): def setUp(self): self.op_type = "elementwise_div" diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 8f6f9851c7006..ccfed61185f0c 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -139,6 +139,9 @@ def test_grad(self): class TestElementwiseSubDoubleGradCheck(unittest.TestCase): + def subtract_wrapper(self, x): + return paddle.subtract(x[0], x[1]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -156,6 +159,11 @@ def func(self, place): gradient_checker.double_grad_check( [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.subtract_wrapper, [x, y], + out, + x_init=[x_arr, y_arr], + place=place) def test_grad(self): paddle.enable_static() @@ -195,6 +203,9 @@ def test_grad(self): class TestElementwiseDivDoubleGradCheck(unittest.TestCase): + def divide_wrapper(self, x): + return paddle.divide(x[0], x[1]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -213,6 +224,12 @@ def func(self, place): gradient_checker.double_grad_check( [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3) + gradient_checker.double_grad_check_for_dygraph( + self.divide_wrapper, [x, y], + out, + x_init=[x_arr, y_arr], + place=place, + atol=1e-3) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py new file mode 100644 index 0000000000000..8f77972de8656 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -0,0 +1,542 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle import tensor +from paddle.fluid import layers +import unittest +from op_test import OpTest +from paddle.fluid.framework import default_main_program +from paddle.fluid.dygraph.layers import Layer +from paddle.fluid.layer_helper import LayerHelper +from paddle.nn.initializer import Constant +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.fluid.framework import _non_static_mode, default_main_program +from paddle import _C_ops +from paddle.incubate.nn.functional import fused_multi_transformer + +default_main_program().random_seed = 42 + + +class TestFusedMultiTransformerOp(OpTest): + def setUp(self): + self.config() + self.generate_input_data() + + self.rtol = 1e-5 + # FIXME(wangxi): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + + paddle.set_default_dtype(self.x_type) + self.__class__.op_type = "fused_multi_transformer" + # use autograd to check grad in this unittest. + self.__class__.no_need_check_grad = False + + bias_attr = paddle.fluid.ParamAttr( + initializer=paddle.fluid.initializer.Constant(value=0.0005)) + self.q_proj = Linear( + self.embed_dim, + self.embed_dim, + self.weight_attr, + bias_attr=bias_attr) + #bias_attr=self.bias_attr) + + self.k_proj = Linear( + self.kdim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr) + self.v_proj = Linear( + self.vdim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr) + self.out_proj = Linear( + self.embed_dim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr) + + self.ffn1_proj = Linear( + self.embed_dim, + 4 * self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr) + self.ffn2_proj = Linear( + 4 * self.embed_dim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr) + + paddle.set_default_dtype(np.float32) + self.norm = LayerNorm(self.embed_dim) + self.ffn_norm = LayerNorm(self.embed_dim) + + paddle.set_default_dtype(self.x_type) + self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + self.activation = getattr(F, self.act_method) + + def config(self): + # for debug + self.debug = False + + self.x_type = np.float32 + self.attn_mask_type = np.float64 + self.pre_layer_norm = True + self.has_attn_mask = True + + # has_cache_kv, gen_cache_kv, stage + # False, False, not generation + # True, True, generation context stage + # True, False, generation decoder stage + self.has_cache_kv = False + self.gen_cache_kv = False + + self.training = False + + self.layers = 4 + self.batch_size = 8 + self.query_length = 128 + self.cache_length = 128 + self.head_dim = 64 + self.num_heads = 16 + self.embed_dim = self.head_dim * self.num_heads + + self.dropout_prob = 0.0 + self.attn_dropout_prob = 0.0 + self.act_method = 'gelu' + self.weight_attr = None + self.bias_attr = None + self.kdim, self.vdim = self.embed_dim, self.embed_dim + self.key_length, self.value_length = self.query_length, 
self.query_length + + def generate_input_data(self): + self.query = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + out_seq_len = self.key_length + if self.has_cache_kv: + assert self.training is False, ValueError( + 'cache_kv can only used in inference') + self.cache_kv = np.random.rand(2, self.batch_size, self.num_heads, + self.cache_length, + self.head_dim).astype(self.x_type) + if self.gen_cache_kv: + self.cache_kv[:] = 0 + else: + out_seq_len += self.cache_length + else: + self.cache_kv = None + + if self.has_attn_mask: + # [B, n_head, seq_len, out_seq_len] + self.attn_mask = np.ones( + (self.batch_size, 1, self.query_length, out_seq_len), + dtype=self.attn_mask_type) + if self.attn_mask_type == np.int64: + self.attn_mask = np.tril(self.attn_mask) + elif self.attn_mask_type == np.float64: + if self.has_cache_kv and not self.gen_cache_kv: + # NOTE: decoder stage, -1(out_seq_len) should no mask + self.attn_mask[:, :, :, -2] = 0.0 + self.attn_mask = (self.attn_mask - 1.0) * 1e4 + else: + self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e4 + else: + raise ValueError( + "'attn_mask_type' should be 'int64' or 'float64'.") + else: + self.attn_mask = None + self.key, self.value = self.query, self.query + + self.dout = np.random.random((self.batch_size, self.query_length, + self.embed_dim)).astype(self.x_type) + + def GetBaselineOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + tensor_query = paddle.to_tensor(self.query, stop_gradient=False) + + cache_kvs = [] + cache_kv = None + if self.has_cache_kv: + cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False) + + if self.has_attn_mask: + attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False) + else: + attn_mask = None + + for i in range(self.layers): + residual = tensor_query + ln1_out = tensor_query + if self.pre_layer_norm: + ln1_out = self.norm(tensor_query) + + q = self.q_proj(ln1_out) + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + k = self.k_proj(ln1_out) + v = self.v_proj(ln1_out) + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + + if self.has_cache_kv: + # [1, B, n_head, cache_seq_len, head_dim] + cache_k, cache_v = paddle.split(cache_kv, 2) + cache_k = paddle.squeeze(cache_k, axis=0) + cache_v = paddle.squeeze(cache_v, axis=0) + # [B, n_head, cache_seq_len + seq_len, head_dim] + # out_seq_len = cache_seq_len + seq_len + if self.debug: + print('q out is') + print(q_out[0, 0, :, :]) + print('cache k out seq=128') + print(k_out[0, 0, :, :]) + if self.gen_cache_kv: + cache_kvs.append((k_out, v_out)) + else: + k_out = paddle.concat([cache_k, k_out], axis=-2) + v_out = paddle.concat([cache_v, v_out], axis=-2) + + # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] + # --> [B, n_head, seq_len, out_seq_len] + qk_out = layers.matmul( + x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5) + + if self.debug: + print('qk out is') + print(qk_out[0][0][0]) + + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype) + attn_mask_out = qk_out + attn_mask + if self.debug: + print('attn mask out is') + print(attn_mask_out[0][0][0]) + softmax_out = F.softmax(attn_mask_out) + else: + softmax_out = F.softmax(qk_out) + + if self.debug: + print('softmax 
out is') + print(softmax_out[0][0][0]) + if self.dropout_prob: + dropout_out = F.dropout( + softmax_out, + self.dropout_prob, + training=self.training, + mode="upscale_in_train") + # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim] + # --> [B, n_head, seq_len, head_dim] + qktv_out = tensor.matmul(dropout_out, v_out) + else: + qktv_out = tensor.matmul(softmax_out, v_out) + + fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3]) + if self.debug: + print('fmha out is') + print(fmha_out[0][0][0]) + out_linear_in = tensor.reshape( + x=fmha_out, + shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]]) + out = self.out_proj(out_linear_in) + + residual_out = residual + self.dropout(out) + if not self.pre_layer_norm: + attn_out = self.norm(residual_out) + else: + attn_out = residual_out + + ffn_ln_out = attn_out + if self.pre_layer_norm: + ffn_ln_out = self.ffn_norm(attn_out) + + ffn1_out = self.ffn1_proj(ffn_ln_out) + ffn1_out = self.dropout(self.activation(ffn1_out)) + ffn2_out = self.ffn2_proj(ffn1_out) + + residual_out = attn_out + self.dropout(ffn2_out) + final_out = residual_out + if not self.pre_layer_norm: + final_out = self.ffn_norm(residual_out) + + tensor_query = final_out + + if self.has_cache_kv and self.gen_cache_kv: + return final_out, cache_kvs + return final_out + + def GetFusedMultiTransformerOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + q_proj_weight = paddle.to_tensor( + self.q_proj.weight, stop_gradient=False) + k_proj_weight = paddle.to_tensor( + self.k_proj.weight, stop_gradient=False) + v_proj_weight = paddle.to_tensor( + self.v_proj.weight, stop_gradient=False) + out_linear_weight = paddle.to_tensor( + self.out_proj.weight, stop_gradient=False) + ffn1_weight = paddle.to_tensor( + self.ffn1_proj.weight, stop_gradient=False) + ffn2_weight = paddle.to_tensor( + self.ffn2_proj.weight, stop_gradient=False) + + if self.bias_attr is False: + qkv_bias_tensor = None + out_linear_bias = None + else: + q_proj_bias = paddle.to_tensor( + self.q_proj.bias, stop_gradient=False) + k_proj_bias = paddle.to_tensor( + self.k_proj.bias, stop_gradient=False) + v_proj_bias = paddle.to_tensor( + self.v_proj.bias, stop_gradient=False) + qkv_bias = np.concatenate( + (q_proj_bias.numpy(), k_proj_bias.numpy(), v_proj_bias.numpy())) + qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim)) + qkv_bias_tensor = paddle.to_tensor(qkv_bias, stop_gradient=False) + out_linear_bias = paddle.to_tensor( + self.out_proj.bias, stop_gradient=False) + ffn1_bias = paddle.to_tensor( + self.ffn1_proj.bias, stop_gradient=False) + ffn2_bias = paddle.to_tensor( + self.ffn2_proj.bias, stop_gradient=False) + + ln_scale = paddle.to_tensor(self.norm.weight, stop_gradient=False) + ln_bias = paddle.to_tensor(self.norm.bias, stop_gradient=False) + ffn_ln_scale = paddle.to_tensor( + self.ffn_norm.weight, stop_gradient=False) + ffn_ln_bias = paddle.to_tensor(self.ffn_norm.bias, stop_gradient=False) + + q_proj_weight = q_proj_weight.numpy().transpose((1, 0)) + k_proj_weight = k_proj_weight.numpy().transpose((1, 0)) + v_proj_weight = v_proj_weight.numpy().transpose((1, 0)) + qkv_weight = np.concatenate( + (q_proj_weight, k_proj_weight, v_proj_weight)) + qkv_weight = qkv_weight.reshape( + (3, self.num_heads, self.head_dim, self.embed_dim)) + + x = paddle.to_tensor(self.query, stop_gradient=False) + cache_kvs, cache_kv = None, None + time_step = None + if self.has_cache_kv: + cache_kvs = [] + + max_seq_length = (self.cache_length + 128) // 128 * 128 + cache_kv = np.zeros( + [ + 2, 
self.batch_size, self.num_heads, max_seq_length, + self.head_dim + ], + dtype=self.x_type) + + elems = 4 + if self.x_type is np.float16: + elems = 8 + + assert self.head_dim % elems == 0 + v_elems = self.head_dim // elems + + # [B, num_head, 128, head_dim] + # cache_k_tmp = self.cache_kv[0, :] + # [B, num_head, 128, head_dim / 4, 4] + cache_k_tmp = self.cache_kv[0].reshape([ + self.batch_size, self.num_heads, self.cache_length, v_elems, + elems + ]) + # [B, num_head, head_dim / 4, 128, 4] + cache_k_tmp = cache_k_tmp.transpose([0, 1, 3, 2, 4]) + + cache_kv[0, :].reshape([ + self.batch_size, self.num_heads, v_elems, max_seq_length, elems + ])[:, :, :, :self.cache_length, :] = cache_k_tmp + + cache_kv[1, :, :, :self.cache_length, :] = self.cache_kv[1] + if self.gen_cache_kv: + assert self.query_length == self.cache_length + cache_kv[:] = 0 + else: + time_step = paddle.to_tensor( + [self.cache_length], dtype='int32', place=paddle.CPUPlace()) + if self.has_attn_mask: + attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False) + else: + attn_mask = None + qkv_weight_tensor = paddle.to_tensor(qkv_weight, stop_gradient=False) + epsilon = 1e-05 + ln2_epsilon = 1e-05 + + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, x.dtype) + + qkv_weights, qkv_biases = [], [] + out_weights, out_biases = [], [] + ln_scales, ln_biases = [], [] + ffn1_weights, ffn1_biases = [], [] + ffn2_weights, ffn2_biases = [], [] + ffn_ln_scales, ffn_ln_biases = [], [] + for i in range(self.layers): + qkv_weights.append(qkv_weight_tensor) + qkv_biases.append(qkv_bias_tensor) + out_weights.append(out_linear_weight) + out_biases.append(out_linear_bias) + ln_scales.append(ln_scale) + ln_biases.append(ln_bias) + ffn1_weights.append(ffn1_weight) + ffn1_biases.append(ffn1_bias) + ffn2_weights.append(ffn2_weight) + ffn2_biases.append(ffn2_bias) + ffn_ln_scales.append(ffn_ln_scale) + ffn_ln_biases.append(ffn_ln_bias) + if self.has_cache_kv: + cache_kvs.append( + paddle.to_tensor( + cache_kv, stop_gradient=False)) + + final_out = fused_multi_transformer( + x, + ln_scales, + ln_biases, + qkv_weights, + qkv_biases, + out_weights, + out_biases, + ffn_ln_scales, + ffn_ln_biases, + ffn1_weights, + ffn1_biases, + ffn2_weights, + ffn2_biases, + pre_layer_norm=self.pre_layer_norm, + epsilon=epsilon, + cache_kvs=cache_kvs, + time_step=time_step, + attn_mask=attn_mask, + dropout_rate=self.dropout_prob, + training=self.training) + + if self.has_cache_kv: + return final_out[0], final_out[1] + + return final_out + + def test_fused_multi_transformer_op(self): + final_out_ref = self.GetBaselineOut() + final_out = self.GetFusedMultiTransformerOut() + if self.has_cache_kv: + final_out, cache_kv_out = final_out + s = cache_kv_out[0].shape + bsz = s[1] + num_head = s[2] + max_seq_len = s[3] + head_dim = s[4] + elems = 8 if self.x_type is np.float16 else 4 + v_elems = head_dim // elems + + if self.debug: + print("cache_k out timestep=128") + print(cache_kv_out[0].reshape([ + 2, bsz, num_head, v_elems, max_seq_len, elems + ])[0, 0, 0, :, self.cache_length, :]) + + print("cache_v out timestep=128") + print(cache_kv_out[0][1, 0, 0, self.cache_length, :]) + + if self.gen_cache_kv: + final_out_ref, cache_kvs = final_out_ref + for i in range(self.layers): + cache_k_ref = cache_kvs[i][0] + cache_v_ref = cache_kvs[i][1] + + cache_k = cache_kv_out[i][0, :] + cache_k = cache_k.reshape( + [bsz, num_head, v_elems, max_seq_len, elems]) + cache_k = cache_k[:, :, :, :self.cache_length, :] + cache_k = cache_k.transpose([0, 1, 3, 2, 4]) + 
cache_k = cache_k.reshape( + [bsz, num_head, self.cache_length, head_dim]) + + cache_v = cache_kv_out[i][1, :, :, :self.cache_length, :] + + np.testing.assert_allclose( + cache_k_ref, cache_k, rtol=self.rtol, atol=self.atol) + np.testing.assert_allclose( + cache_v_ref, cache_v, rtol=self.rtol, atol=self.atol) + if i == 0: + break + + np.testing.assert_allclose( + final_out_ref, final_out, rtol=self.rtol, atol=self.atol) + + +class TestFusedMultiTransformerOpFp16(TestFusedMultiTransformerOp): + def config(self): + super().config() + self.x_type = np.float16 + self.layers = 3 # odd layers + + +class TestFusedMultiTransformerOpCacheKV(TestFusedMultiTransformerOp): + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.layers = 3 # odd layers + + +class TestFusedMultiTransformerOpCacheKVFp16(TestFusedMultiTransformerOp): + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.x_type = np.float16 + + +class TestFusedMultiTransformerOpGenCacheKV(TestFusedMultiTransformerOp): + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + + +class TestFusedMultiTransformerOpGenCacheKVFp16(TestFusedMultiTransformerOp): + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + self.x_type = np.float16 + self.layers = 3 # odd layers + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 18620f55367f6..d200b77eea83f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -919,7 +919,7 @@ def train(layer, loader, loss_fn, opt): # load_inference_model paddle.enable_static() - exe = paddle.static.Executor(paddle.CPUPlace()) + exe = paddle.static.Executor() [inference_program, feed_target_names, fetch_targets] = ( paddle.static.load_inference_model(path, exe)) tensor_img = x @@ -927,8 +927,8 @@ def train(layer, loader, loss_fn, opt): feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) print("pred.numpy()", pred.numpy()) - print("results", results) - self.assertTrue(np.allclose(pred.numpy(), results, atol=1.e-5)) + print("result", results[0]) + self.assertTrue(np.array_equal(pred.numpy(), results[0])) paddle.disable_static() def test_inference_save_load(self): @@ -1254,18 +1254,17 @@ def train(self, enable_amp=True, amp_level='O1'): def test_bf16(self): def func_isinstance(): - if fluid.core.is_compiled_with_cuda(): - cudnn_version = paddle.device.get_cudnn_version() - if cudnn_version is not None and cudnn_version >= 8100: - out_fp32 = self.train(enable_amp=False) - out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') - out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') - self.assertTrue( - np.allclose( - out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) - self.assertTrue( - np.allclose( - out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) + if fluid.core.is_compiled_with_cuda( + ) and fluid.core.is_bfloat16_supported(paddle.CUDAPlace(0)): + out_fp32 = self.train(enable_amp=False) + out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') + out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O1, rtol=1.e-3, 
atol=1.e-1)) + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) with _test_eager_guard(): func_isinstance() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 3a9387082e680..52137b22a790c 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -1037,11 +1037,11 @@ def func_dirac(self): block = start_prog.global_block() self.assertEqual(len(block.ops), self.num_ops) self.assertEqual(block.ops[0].type, 'fill_constant') - self.assertEqual(block.ops[1].type, 'reshape') + self.assertEqual(block.ops[1].type, 'reshape2') self.assertEqual(block.ops[2].type, 'assign_value') self.assertEqual(block.ops[3].type, 'assign_value') self.assertEqual(block.ops[4].type, 'scatter') - self.assertEqual(block.ops[5].type, 'reshape') + self.assertEqual(block.ops[5].type, 'reshape2') exe = paddle.static.Executor() exe.run(start_prog) diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py index 83c8ced79b1e8..54f5e64fda4b6 100644 --- a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py +++ b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py @@ -19,8 +19,6 @@ import paddle.nn.functional as F import paddle.fluid.initializer as I import unittest -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() class LabelSmoothTestCase(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index c71ff4381028d..a1440f8587ab6 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -16,6 +16,10 @@ import unittest import numpy import paddle.nn.functional as F +import tempfile +import warnings +import json +import os class SimpleNet(paddle.nn.Layer): @@ -41,10 +45,18 @@ def forward(self, image): class LayoutAutoTune(unittest.TestCase): def use_autoune(self): if paddle.is_compiled_with_cuda(): - paddle.fluid.core.enable_layout_autotune() + paddle.incubate.autotune.set_config( + config={"layout": { + "enable": True + }}) return paddle.fluid.core.use_layout_autotune() else: - paddle.fluid.core.disable_layout_autotune() + config = {"layout": {"enable": False}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) return paddle.fluid.core.use_layout_autotune() def train(self, data_format): @@ -103,7 +115,6 @@ def test_transpose_op_transposer(self): def test_flatten_op_transposer(self): if not self.use_autoune(): return - paddle.fluid.core.enable_layout_autotune() conv = paddle.nn.Conv2D(3, 8, (3, 3)) flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) data = paddle.rand([1, 3, 16, 14]) @@ -119,5 +130,20 @@ def test_flatten_op_transposer(self): self.assertEqual(out.shape, [1, 112, 12]) +class TestAutoTuneAPI(unittest.TestCase): + def test_set_config_warnings(self): + with warnings.catch_warnings(record=True) as w: + config = {"layout": {"enable": 1}} + # On linux, we can open the file again to read the content + # without closing the file, but on windows system, there is + # no permission to open it again without closing it. 
+ tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) + self.assertTrue(len(w) == 1) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs.py b/python/paddle/fluid/tests/unittests/test_lbfgs.py index 2cad4822b28b1..bb3818747601f 100644 --- a/python/paddle/fluid/tests/unittests/test_lbfgs.py +++ b/python/paddle/fluid/tests/unittests/test_lbfgs.py @@ -21,9 +21,6 @@ from paddle.incubate.optimizer.functional.lbfgs import minimize_lbfgs -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - np.random.seed(123) diff --git a/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py b/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py index e5ec67d41f7ef..d273185ad185f 100644 --- a/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py +++ b/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py @@ -17,6 +17,7 @@ import numpy as np from paddle.distributed.models.moe import utils from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard def limit_by_capacity(expert_count, _capacity, n_worker): @@ -77,7 +78,7 @@ def test_static_api(self): assert all_close(self.out, res[0], self.n_worker) - def test_dygraph_api(self): + def func_dygraph_api(self): paddle.disable_static(self.place) capacity = paddle.to_tensor(self.capacity) expert_count_tensor = paddle.to_tensor(self.expert_count) @@ -85,6 +86,11 @@ def test_dygraph_api(self): self.n_worker) assert all_close(self.out, out.numpy(), self.n_worker) + def test_dygraph_api(self): + with _test_eager_guard(): + self.func_dygraph_api() + self.func_dygraph_api() + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 492f300e3b848..3e06b69278d34 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -385,9 +385,9 @@ def test_check_grad(self): def create_test_bf16_class(parent, atol=0.01): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, - "core is not compiled with CUDA and cudnn version need larger than 8.1.0" - ) + not core.is_compiled_with_cuda() or + not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16") class TestMatMulOpBf16Case(parent): def get_numeric_grad(self, place, check_name): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index ecde527523d3d..4dfc881d7723f 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -216,6 +216,14 @@ def test_dim_less_than_1(): self.assertRaises(ValueError, test_dim_less_than_1) + with self.assertRaises(ValueError): + y = paddle.multinomial(paddle.to_tensor([1., 2., -3.])) + + with self.assertRaises(ValueError): + prob = paddle.rand([20, 1000]) + prob[1:0] = 0 + y = paddle.multinomial(prob) + class TestRandomValue(unittest.TestCase): def test_fixed_random_number(self): diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 9b11f6711afc1..84559048a2b8a 
100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -20,8 +20,6 @@ import sys import subprocess import paddle -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py index e50424126e53e..0b5493e21705f 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py @@ -19,14 +19,13 @@ import paddle import paddle.nn as nn import numpy as np -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard paddle.disable_static() class EmbeddingDygraph(unittest.TestCase): - def test_1(self): + def func_1(self): x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) paddle.disable_static(paddle.CPUPlace()) x = paddle.to_tensor(x_data, stop_gradient=False) @@ -44,7 +43,12 @@ def test_1(self): out.backward() adam.step() - def test_2(self): + def test_1(self): + with _test_eager_guard(): + self.func_1() + self.func_1() + + def func_2(self): x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) paddle.disable_static(paddle.CPUPlace()) @@ -60,6 +64,11 @@ def test_2(self): with self.assertRaises(ValueError): embedding = paddle.nn.Embedding(10, -3, sparse=True) + def test_2(self): + with _test_eager_guard(): + self.func_2() + self.func_2() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index 49fe397644dc6..1452b869d4f8b 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -43,6 +43,7 @@ def func(self, place): [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) def test_grad(self): + paddle.enable_static() places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -77,6 +78,14 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = False self.shape = [2, 3, 4, 5] + self.channel_index = 1 + + def batch_norm_wrapper(self, x): + batch_norm = paddle.nn.BatchNorm2D( + self.shape[self.channel_index], + data_format=self.data_layout, + use_global_stats=self.use_global_stats) + return batch_norm(x[0]) @prog_scope() def func(self, place): @@ -94,8 +103,15 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype) gradient_checker.double_grad_check( [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.batch_norm_wrapper, [x], + z, + x_init=x_arr, + atol=atol, + place=place) def test_grad(self): + paddle.enable_static() places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -108,6 +124,7 @@ def init_test(self): self.data_layout = 'NHWC' self.use_global_stats = False self.shape = [2, 3, 4, 5] + self.channel_index = 3 class TestBatchNormDoubleGradCheckCase2(TestBatchNormDoubleGradCheck): @@ -115,6 +132,7 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = True self.shape = [2, 3, 4, 5] + self.channel_index = 1 class 
TestBatchNormDoubleGradCheckCase3(TestBatchNormDoubleGradCheck): @@ -122,6 +140,7 @@ def init_test(self): self.data_layout = 'NHWC' self.use_global_stats = True self.shape = [2, 3, 4, 5] + self.channel_index = 3 class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): @@ -129,6 +148,14 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = False self.shape = [2, 2, 3, 4, 5] + self.channel_index = 1 + + def batch_norm_wrapper(self, x): + batch_norm = paddle.nn.BatchNorm3D( + self.shape[self.channel_index], + data_format=self.data_layout, + use_global_stats=self.use_global_stats) + return batch_norm(x[0]) class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck): @@ -165,8 +192,8 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = True self.shape = [2, 3, 4, 5] + self.channel_index = 1 if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py index 9eb89dfeb0e8d..bb09b8c6512f7 100644 --- a/python/paddle/fluid/tests/unittests/test_number_count_op.py +++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py @@ -24,6 +24,7 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward from paddle.distributed.models.moe import utils +from paddle.fluid.framework import _test_eager_guard def count(x, upper_num): @@ -68,12 +69,17 @@ def test_api_static(self): res = exe.run(feed={'x': self.x}, fetch_list=[out]) assert np.allclose(res, self.out) - def test_api_dygraph(self): + def func_api_dygraph(self): paddle.disable_static() x = paddle.to_tensor(self.x) out = utils._number_count(x, self.upper_num) assert np.allclose(out.numpy(), self.out) + def test_api_dygraph(self): + with _test_eager_guard(): + self.func_api_dygraph() + self.func_api_dygraph() + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py new file mode 100644 index 0000000000000..768a9e307c91e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py @@ -0,0 +1,294 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
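
Pixel unshuffle is the inverse of pixel shuffle: with a downscale factor r it folds every r x r spatial block into the channel dimension, turning an [N, C, H, W] tensor into [N, C*r*r, H/r, W/r]. A small shape check, not part of the patch and assuming the functional and layer forms added in this PR behave like the numpy reference implementation below, could look like:

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([2, 3, 8, 8])        # NCHW input
    y = F.pixel_unshuffle(x, 2)           # -> [2, 12, 4, 4]
    layer = paddle.nn.PixelUnshuffle(2)
    print(y.shape, layer(x).shape)        # both [2, 12, 4, 4]

    # The existing pixel_shuffle op undoes the rearrangement.
    z = F.pixel_shuffle(y, 2)             # -> [2, 3, 8, 8]
    print(bool(paddle.allclose(z, x)))    # True
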
+ +import unittest +import numpy as np + +from op_test import OpTest +import paddle +import paddle.nn.functional as F +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def pixel_unshuffle_np(x, down_factor, data_format="NCHW"): + '''Numpy implementation of pixel unshuffle''' + + if data_format == "NCHW": + n, c, h, w = x.shape + new_shape = (n, c, h // down_factor, down_factor, w // down_factor, + down_factor) + npresult = np.reshape(x, new_shape) + npresult = npresult.transpose(0, 1, 3, 5, 2, 4) + oshape = [ + n, c * down_factor * down_factor, h // down_factor, w // down_factor + ] + npresult = np.reshape(npresult, oshape) + return npresult + else: + n, h, w, c = x.shape + new_shape = (n, h // down_factor, down_factor, w // down_factor, + down_factor, c) + npresult = np.reshape(x, new_shape) + npresult = npresult.transpose(0, 1, 3, 5, 2, 4) + oshape = [ + n, h // down_factor, w // down_factor, c * down_factor * down_factor + ] + npresult = np.reshape(npresult, oshape) + return npresult + + +class TestPixelUnshuffleOp(OpTest): + '''TestPixelUnshuffleOp''' + + def setUp(self): + '''setUp''' + + self.op_type = "pixel_unshuffle" + self.init_data_format() + n, c, h, w = 2, 1, 12, 12 + + if self.format == "NCHW": + shape = [n, c, h, w] + if self.format == "NHWC": + shape = [n, h, w, c] + + down_factor = 3 + + x = np.random.random(shape).astype("float64") + npresult = pixel_unshuffle_np(x, down_factor, self.format) + + self.inputs = {"X": x} + self.outputs = {"Out": npresult} + self.attrs = { + "downscale_factor": down_factor, + "data_format": self.format + } + + def init_data_format(self): + '''init_data_format''' + + self.format = "NCHW" + + def test_check_output(self): + '''test_check_output''' + + self.check_output() + + def test_check_grad(self): + '''test_check_grad''' + + self.check_grad(["X"], "Out") + + +class TestChannelLast(TestPixelUnshuffleOp): + '''TestChannelLast''' + + def init_data_format(self): + '''init_data_format''' + + self.format = "NHWC" + + +class TestPixelUnshuffleAPI(unittest.TestCase): + '''TestPixelUnshuffleAPI''' + + def setUp(self): + '''setUp''' + + self.x_1_np = np.random.random([2, 1, 12, 12]).astype("float64") + self.x_2_np = np.random.random([2, 12, 12, 1]).astype("float64") + self.out_1_np = pixel_unshuffle_np(self.x_1_np, 3) + self.out_2_np = pixel_unshuffle_np(self.x_2_np, 3, "NHWC") + + def test_static_graph_functional(self): + '''test_static_graph_functional''' + + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=[2, 1, 12, 12], dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=[2, 12, 12, 1], dtype="float64") + out_1 = F.pixel_unshuffle(x_1, 3) + out_2 = F.pixel_unshuffle(x_2, 3, "NHWC") + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_1_np}, + fetch_list=out_1, + use_prune=True) + + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_2_np}, + fetch_list=out_2, + use_prune=True) + + assert np.allclose(res_1, self.out_1_np) + assert np.allclose(res_2, self.out_2_np) + + # same test between layer and functional in this op. 
+ def test_static_graph_layer(self): + '''test_static_graph_layer''' + + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=[2, 1, 12, 12], dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=[2, 12, 12, 1], dtype="float64") + # init instance + ps_1 = paddle.nn.PixelUnshuffle(3) + ps_2 = paddle.nn.PixelUnshuffle(3, "NHWC") + out_1 = ps_1(x_1) + out_2 = ps_2(x_2) + out_1_np = pixel_unshuffle_np(self.x_1_np, 3) + out_2_np = pixel_unshuffle_np(self.x_2_np, 3, "NHWC") + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_1_np}, + fetch_list=out_1, + use_prune=True) + + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_2_np}, + fetch_list=out_2, + use_prune=True) + + assert np.allclose(res_1, out_1_np) + assert np.allclose(res_2, out_2_np) + + def run_dygraph(self, down_factor, data_format): + '''run_dygraph''' + + n, c, h, w = 2, 1, 12, 12 + + if data_format == "NCHW": + shape = [n, c, h, w] + if data_format == "NHWC": + shape = [n, h, w, c] + + x = np.random.random(shape).astype("float64") + + npresult = pixel_unshuffle_np(x, down_factor, data_format) + + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.disable_static(place=place) + + pixel_unshuffle = paddle.nn.PixelUnshuffle( + down_factor, data_format=data_format) + result = pixel_unshuffle(paddle.to_tensor(x)) + + self.assertTrue(np.allclose(result.numpy(), npresult)) + + result_functional = F.pixel_unshuffle( + paddle.to_tensor(x), 3, data_format) + self.assertTrue(np.allclose(result_functional.numpy(), npresult)) + + pixel_unshuffle_str = 'downscale_factor={}'.format(down_factor) + if data_format != 'NCHW': + pixel_unshuffle_str += ', data_format={}'.format(data_format) + self.assertEqual(pixel_unshuffle.extra_repr(), pixel_unshuffle_str) + + def test_dygraph1(self): + '''test_dygraph1''' + + self.run_dygraph(3, "NCHW") + + def test_dygraph2(self): + '''test_dygraph2''' + + self.run_dygraph(3, "NHWC") + + +class TestPixelUnshuffleError(unittest.TestCase): + '''TestPixelUnshuffleError''' + + def test_error_functional(self): + '''test_error_functional''' + + def error_input(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([4, 12, 12]).astype("float64") + pixel_unshuffle = F.pixel_unshuffle(paddle.to_tensor(x), 2) + + self.assertRaises(ValueError, error_input) + + def error_downscale_factor_1(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 1, 12, 12]).astype("float64") + pixel_unshuffle = F.pixel_unshuffle(paddle.to_tensor(x), 3.33) + + self.assertRaises(TypeError, error_downscale_factor_1) + + def error_downscale_factor_2(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 1, 12, 12]).astype("float64") + pixel_unshuffle = F.pixel_unshuffle(paddle.to_tensor(x), -1) + + self.assertRaises(ValueError, error_downscale_factor_2) + + def error_data_format(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 1, 12, 12]).astype("float64") + pixel_unshuffle = F.pixel_unshuffle( + paddle.to_tensor(x), 3, "WOW") + + self.assertRaises(ValueError, error_data_format) + + def test_error_layer(self): + '''test_error_layer''' + + def error_input_layer(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([4, 12, 
12]).astype("float64") + ps = paddle.nn.PixelUnshuffle(2) + ps(paddle.to_tensor(x)) + + self.assertRaises(ValueError, error_input_layer) + + def error_downscale_factor_layer_1(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 1, 12, 12]).astype("float64") + ps = paddle.nn.PixelUnshuffle(3.33) + + self.assertRaises(TypeError, error_downscale_factor_layer_1) + + def error_downscale_factor_layer_2(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 1, 12, 12]).astype("float64") + ps = paddle.nn.PixelUnshuffle(-1) + + self.assertRaises(ValueError, error_downscale_factor_layer_2) + + def error_data_format_layer(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 1, 12, 12]).astype("float64") + ps = paddle.nn.PixelUnshuffle(3, "MEOW") + + self.assertRaises(ValueError, error_data_format_layer) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py b/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py index d9d110f45ff79..8a641a6b4faf9 100644 --- a/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py +++ b/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py @@ -17,6 +17,7 @@ import numpy as np from paddle.distributed.models.moe import utils from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard def count(x, upper_num): @@ -102,7 +103,7 @@ def test_static_api(self): fetch_list=out) assert_allclose(res[0], self.out, self.n_expert) - def test_dygraph_api(self): + def func_dygraph_api(self): paddle.disable_static(self.place) gate_idx_tensor = paddle.to_tensor(self.gate_idx) expert_count_tensor = paddle.to_tensor(self.expert_count) @@ -110,6 +111,11 @@ def test_dygraph_api(self): gate_idx_tensor, expert_count_tensor, self.n_expert, self.n_worker) assert_allclose(out.numpy(), self.out, self.n_expert) + def test_dygraph_api(self): + with _test_eager_guard(): + self.func_dygraph_api() + self.func_dygraph_api() + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_qr_op.py b/python/paddle/fluid/tests/unittests/test_qr_op.py index 4be46837a67ae..ecf65d16d3431 100644 --- a/python/paddle/fluid/tests/unittests/test_qr_op.py +++ b/python/paddle/fluid/tests/unittests/test_qr_op.py @@ -27,7 +27,7 @@ class TestQrOp(OpTest): def setUp(self): paddle.enable_static() - np.random.seed(4) + np.random.seed(7) self.op_type = "qr" a, q, r = self.get_input_and_output() self.inputs = {"X": a} @@ -74,7 +74,8 @@ def test_check_output(self): self.check_output() def test_check_grad_normal(self): - self.check_grad(['X'], ['Q', 'R']) + self.check_grad( + ['X'], ['Q', 'R'], numeric_grad_delta=1e-5, max_relative_error=1e-6) class TestQrOpCase1(TestQrOp): @@ -116,6 +117,7 @@ def get_shape(self): class TestQrAPI(unittest.TestCase): def test_dygraph(self): paddle.disable_static() + np.random.seed(7) def run_qr_dygraph(shape, mode, dtype): if dtype == "float32": @@ -180,6 +182,7 @@ def run_qr_dygraph(shape, mode, dtype): def test_static(self): paddle.enable_static() + np.random.seed(7) def run_qr_static(shape, mode, dtype): if dtype == "float32": diff --git a/python/paddle/fluid/tests/unittests/test_random_routing_op.py b/python/paddle/fluid/tests/unittests/test_random_routing_op.py index dc8f6f5fcec15..e4bb7c5ca5fd8 100644 --- a/python/paddle/fluid/tests/unittests/test_random_routing_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_random_routing_op.py @@ -24,6 +24,7 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward from paddle.distributed.models.moe import utils +from paddle.fluid.framework import _test_eager_guard def random_routing(topk_idx, topk_value, prob, topk=2): @@ -55,7 +56,7 @@ def init(self): self.prob).astype(self.dtype) self.place = paddle.CUDAPlace(0) - def test_api_dygraph(self): + def func_api_dygraph(self): paddle.disable_static() x = paddle.to_tensor(self.x) value = paddle.to_tensor(self.topk_value) @@ -63,6 +64,11 @@ def test_api_dygraph(self): out = utils._random_routing(x, value, prob) assert np.allclose(out.numpy(), self.out) + def test_api_dygraph(self): + with _test_eager_guard(): + self.func_api_dygraph() + self.func_api_dygraph() + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py new file mode 100644 index 0000000000000..8d65a4c4444d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
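
The new sparse pooling test follows one pattern throughout: build a dense NDHWC tensor, convert its first four dimensions to sparse COO (the trailing channel dimension stays dense), run the sparse max_pool3d, and compare against the dense op. A condensed sketch of that pattern, outside the patch itself:

    import numpy as np
    import paddle
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():                    # the sparse API is exercised in eager mode
        dense_x = paddle.randn((1, 4, 4, 4, 3))  # NDHWC
        sparse_x = dense_x.to_sparse_coo(4)      # 4 sparse dims; channel dim stays dense
        sparse_out = paddle.sparse.functional.max_pool3d(
            sparse_x, [3, 3, 3], stride=1, padding=0)
        dense_out = paddle.nn.functional.max_pool3d(
            dense_x, [3, 3, 3], stride=1, padding=0, data_format='NDHWC')
        assert np.allclose(dense_out.numpy(), sparse_out.to_dense().numpy())
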
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard +import copy + + +class TestMaxPool3DFunc(unittest.TestCase): + def setInput(self): + paddle.seed(0) + self.dense_x = paddle.randn((1, 4, 4, 4, 4)) + + def setKernelSize(self): + self.kernel_sizes = [3, 3, 3] + + def setStride(self): + self.strides = [1, 1, 1] + + def setPadding(self): + self.paddings = [0, 0, 0] + + def setUp(self): + self.setInput() + self.setKernelSize() + self.setStride() + self.setPadding() + + def test(self): + with _test_eager_guard(): + self.setUp() + self.dense_x.stop_gradient = False + sparse_x = self.dense_x.to_sparse_coo(4) + sparse_out = paddle.sparse.functional.max_pool3d( + sparse_x, + self.kernel_sizes, + stride=self.strides, + padding=self.paddings) + out = sparse_out.to_dense() + out.backward(out) + + dense_x = copy.deepcopy(self.dense_x) + dense_out = paddle.nn.functional.max_pool3d( + dense_x, + self.kernel_sizes, + stride=self.strides, + padding=self.paddings, + data_format='NDHWC') + dense_out.backward(dense_out) + + #compare with dense + assert np.allclose(dense_out.numpy(), out.numpy()) + assert np.allclose(dense_x.grad.numpy(), self.dense_x.grad.numpy()) + + +class TestStride(TestMaxPool3DFunc): + def setStride(self): + self.strides = 1 + + +class TestPadding(TestMaxPool3DFunc): + def setPadding(self): + self.paddings = 1 + + def setInput(self): + self.dense_x = paddle.randn((1, 5, 6, 8, 3)) + + +class TestKernelSize(TestMaxPool3DFunc): + def setKernelSize(self): + self.kernel_sizes = [5, 5, 5] + + def setInput(self): + paddle.seed(0) + self.dense_x = paddle.randn((1, 6, 9, 6, 3)) + + +class TestInput(TestMaxPool3DFunc): + def setInput(self): + paddle.seed(0) + self.dense_x = paddle.randn((2, 6, 7, 9, 3)) + dropout = paddle.nn.Dropout(0.8) + self.dense_x = dropout(self.dense_x) + + +class TestMaxPool3DAPI(unittest.TestCase): + def test(self): + with _test_eager_guard(): + dense_x = paddle.randn((2, 3, 6, 6, 3)) + sparse_x = dense_x.to_sparse_coo(4) + max_pool3d = paddle.sparse.MaxPool3D( + kernel_size=3, data_format='NDHWC') + out = max_pool3d(sparse_x) + out = out.to_dense() + + dense_out = paddle.nn.functional.max_pool3d( + dense_x, 3, data_format='NDHWC') + assert np.allclose(dense_out.numpy(), out.numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py new file mode 100644 index 0000000000000..5475fd4a10a13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_multi_transformer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + +import os +import paddle + +paddle.enable_static() +flag_name = os.path.splitext(__file__)[0] + + +class TestStaticModelParallel(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl_comm_num = 1 + self._pipeline_mode = True + + def test_dist_static_model_parallel_fused_multi_transformer(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "static_model_parallel_fused_multi_transformer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 1775272aac69d..0049a922b9166 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -15,6 +15,10 @@ import paddle import unittest import numpy as np +import tempfile +import warnings +import json +import os class SimpleNet(paddle.nn.Layer): @@ -73,10 +77,13 @@ def get_expected_res(self, step_id, enable_autotune): return expected_res def test_autotune(self): - paddle.fluid.core.disable_autotune() + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": False + }}) self.assertEqual(self.get_flags("FLAGS_use_autotune"), False) - paddle.fluid.core.enable_autotune() + paddle.incubate.autotune.set_config(config={"kernel": {"enable": True}}) self.assertEqual(self.get_flags("FLAGS_use_autotune"), True) def check_status(self, expected_res): @@ -93,10 +100,16 @@ class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): self.set_flags(enable_autotune) if enable_autotune: - paddle.fluid.core.enable_autotune() + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": True, + "tuning_range": [1, 2] + }}) else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.set_autotune_range(1, 2) + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": False + }}) x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.) 
net = SimpleNet() for i in range(3): @@ -141,10 +154,18 @@ def run_program(self, enable_autotune): self.set_flags(enable_autotune) if enable_autotune: - paddle.fluid.core.enable_autotune() + config = {"kernel": {"enable": True, "tuning_range": [1, 2]}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.set_autotune_range(1, 2) + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": False, + "tuning_range": [1, 2] + }}) for i in range(3): exe.run(program=main_program, feed={'X': x}, fetch_list=[loss]) @@ -166,5 +187,22 @@ def test_disable_autotune(self): self.func_disable_autotune() +class TestAutoTuneAPI(unittest.TestCase): + def test_set_config_warnings(self): + with warnings.catch_warnings(record=True) as w: + config = {"kernel": {"enable": 1, "tuning_range": 1}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) + self.assertTrue(len(w) == 2) + + def test_set_config_attr(self): + paddle.incubate.autotune.set_config(config=None) + self.assertEqual( + paddle.get_flags("FLAGS_use_autotune")["FLAGS_use_autotune"], True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index a5ca53108fc59..e7f85f0451a17 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -462,11 +462,9 @@ def double_print_hook(grad): x.register_hook(double_print_hook) y = x * x - fluid.set_flags({'FLAGS_retain_grad_for_all_tensor': False}) # Since y = x * x, dx = 2 * x dx = paddle.grad( outputs=[y], inputs=[x], create_graph=True, retain_graph=True)[0] - fluid.set_flags({'FLAGS_retain_grad_for_all_tensor': True}) z = y + dx self.assertTrue(x.grad is None) diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 5deca1dc5acd4..91731c1dd0b21 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -51,6 +51,7 @@ 'matrix_power', \ 'cholesky_solve', \ 'solve', \ + 'qr', \ ] NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\ diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index ea88a89e68224..95ab446e1de6d 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -42,6 +42,7 @@ def __init__(self, nrings): self.nrings = nrings self.endpoints = None self.current_endpoint = None + self.other_endpoints = None self.nranks = None self.rank = None self.startup_program = None @@ -79,6 +80,12 @@ def transpile(self, startup_program, main_program, rank, endpoints, self.endpoints = endpoints self.current_endpoint = current_endpoint + if current_endpoint: + nranks = len(endpoints) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + self.other_endpoints = other_endpoints + self.wait_port = wait_port self.startup_program._origin_program = self.startup_program.clone() @@ -462,9 +469,41 @@ def 
_transpile_startup_program(self): self.rank, ring_id, self.wait_port, True) else: - print("begin to _transpile_startup_program for single-node") - block = self.startup_program.global_block() - block.append_op(type='c_comm_init_all', attrs={'ring_id': 0}) + if "xpu" in self.trans_mode: + print( + "begin to _transpile_startup_program for single-node in XPU") + block = self.startup_program.global_block() + comm_id_var = block.create_var( + name=unique_name.generate('comm_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_bkcl_id', + inputs={}, + outputs={'Out': comm_id_var}, + attrs={ + 'rank': self.rank, + 'endpoint': self.current_endpoint, + 'other_endpoints': self.other_endpoints, + 'ring_id': 0, + self.op_role_key: OpRole.Forward + }) + block.append_op( + type='c_comm_init', + inputs={'X': comm_id_var}, + outputs={}, + attrs={ + 'nranks': + len(os.getenv("FLAGS_selected_gpus").split(",")), + 'rank': self.rank, + 'ring_id': 0, + self.op_role_key: OpRole.Forward + }) + + else: + print("begin to _transpile_startup_program for single-node") + block = self.startup_program.global_block() + block.append_op(type='c_comm_init_all', attrs={'ring_id': 0}) def _transpile_main_program(self): self._insert_scale_loss_grad_ops() diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 4f836d94b34eb..c1891d24b88c9 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -34,6 +34,10 @@ from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX +try: + from collections.abc import Iterable +except: + from collections import Iterable __all__ = [] @@ -424,7 +428,7 @@ def _parse_every_object(obj, condition_func, convert_func): elif type(obj) == set: return set(_parse_every_object(list(obj), condition_func, convert_func)) else: - if isinstance(obj, collections.Iterable) and not isinstance( + if isinstance(obj, Iterable) and not isinstance( obj, (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor)): raise NotImplementedError( diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index d8cc322a66e27..ff7a167f1a670 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -29,6 +29,7 @@ from .tensor import segment_min from .passes import fuse_resnet_unit_pass import paddle.incubate.autograd +import paddle.incubate.autotune from . import nn #noqa: F401 diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py new file mode 100644 index 0000000000000..e98a23bc52d65 --- /dev/null +++ b/python/paddle/incubate/autotune.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
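
The module added below exposes a single public entry point, set_config, which maps a dict (or the path of a JSON file) onto the existing core switches for kernel, layout, and dataloader auto-tuning. A short sketch of the dict form, not part of the patch and following the schema documented in the docstring that follows:

    import paddle

    paddle.incubate.autotune.set_config({
        "kernel": {"enable": True, "tuning_range": [1, 3]},
        "layout": {"enable": False},
    })
    # Kernel tuning is reflected in the FLAGS_use_autotune flag, which the
    # updated unit tests read back to verify the configuration took effect.
    print(paddle.get_flags("FLAGS_use_autotune")["FLAGS_use_autotune"])  # True
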
+ +import paddle +import json +import warnings +from paddle.fluid import core + +__all__ = ['set_config'] + + +def set_config(config=None): + r""" + Set the configuration for kernel, layout and dataloader auto-tuning. + + 1. kernel: When it is enabled, exhaustive search method will be used to select + and cache the best algorithm for the operator in the tuning iteration. Tuning + parameters are as follows: + + - enable(bool): Whether to enable kernel tuning. + - tuning_range(list): Start and end iteration for auto-tuning. Default: [1, 10]. + + 2. layout: When it is enabled, the best data layout such as NCHW or NHWC will be + determined based on the device and data type. When the origin layout setting is + not best, layout transformation will be automaticly performed to improve model + performance. Layout auto-tuning only supports dygraph mode currently. Tuning + parameters are as follows: + + - enable(bool): Whether to enable layout tuning. + + 3. dataloader: When it is enabled, the best num_workers will be selected to replace + the origin dataloader setting. Tuning parameters are as follows: + + - enable(bool): Whether to enable dataloader tuning. + + Args: + config (dict|str|None, optional): Configuration for auto-tuning. If it is a + dictionary, the key is the tuning type, and the value is a dictionary + of the corresponding tuning parameters. If it is a string, the path of + a json file will be specified and the tuning configuration will be set + by the the json file. Default: None, auto-tuning for kernel, layout and + dataloader will be enabled. + + Examples: + .. code-block:: python + :name: auto-tuning + + import paddle + import json + + # config is a dict. + config = { + "kernel": { + "enable": True, + "tuning_range": [1, 5], + }, + "layout": { + "enable": True, + }, + "dataloader": { + "enable": True, + } + } + paddle.incubate.autotune.set_config(config) + + # config is the path of json file. + config_json = json.dumps(config) + with open('config.json', 'w') as json_file: + json_file.write(config_json) + paddle.incubate.autotune.set_config('config.json') + + """ + if config is None: + core.enable_autotune() + core.enable_layout_autotune() + paddle.fluid.reader.set_autotune_config(use_autotune=True) + return + + config_dict = {} + if isinstance(config, dict): + config_dict = config + elif isinstance(config, str): + try: + with open(config, 'r') as filehandle: + config_dict = json.load(filehandle) + except Exception as e: + print('Load config error: {}'.format(e)) + warnings.warn("Use default configuration for auto-tuning.") + + if "kernel" in config_dict: + kernel_config = config_dict["kernel"] + if "enable" in kernel_config: + if isinstance(kernel_config['enable'], bool): + if kernel_config['enable']: + core.enable_autotune() + else: + core.disable_autotune() + else: + warnings.warn( + "The auto-tuning configuration of the kernel is incorrect." + "The `enable` should be bool. Use default parameter instead." + ) + if "tuning_range" in kernel_config: + if isinstance(kernel_config['tuning_range'], list): + tuning_range = kernel_config['tuning_range'] + assert len(tuning_range) == 2 + core.set_autotune_range(tuning_range[0], tuning_range[1]) + else: + warnings.warn( + "The auto-tuning configuration of the kernel is incorrect." + "The `tuning_range` should be list. Use default parameter instead." 
+ ) + if "layout" in config_dict: + layout_config = config_dict["layout"] + if "enable" in layout_config: + if isinstance(layout_config['enable'], bool): + if layout_config['enable']: + core.enable_layout_autotune() + else: + core.disable_layout_autotune() + else: + warnings.warn( + "The auto-tuning configuration of the layout is incorrect." + "The `enable` should be bool. Use default parameter instead." + ) + if "dataloader" in config_dict: + dataloader_config = config_dict["dataloader"] + use_autoune = False + if "enable" in dataloader_config: + if isinstance(dataloader_config['enable'], bool): + use_autoune = dataloader_config['enable'] + else: + warnings.warn( + "The auto-tuning configuration of the dataloader is incorrect." + "The `enable` should be bool. Use default parameter instead." + ) + if "tuning_steps" in dataloader_config: + if isinstance(dataloader_config['tuning_steps'], int): + paddle.fluid.reader.set_autotune_config( + use_autoune, dataloader_config['tuning_steps']) + else: + warnings.warn( + "The auto-tuning configuration of the dataloader is incorrect." + "The `tuning_steps` should be int. Use default parameter instead." + ) + paddle.fluid.reader.set_autotune_config(use_autoune) diff --git a/python/paddle/incubate/distributed/models/moe/__init__.py b/python/paddle/incubate/distributed/models/moe/__init__.py index e1663029ef1f8..fd06b4b8e5287 100644 --- a/python/paddle/incubate/distributed/models/moe/__init__.py +++ b/python/paddle/incubate/distributed/models/moe/__init__.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .gate import GShardGate, BaseGate, SwitchGate, NaiveGate +from .moe_layer import MoELayer +from .grad_clip import ClipGradForMOEByGlobalNorm +ClipGradByGlobalNorm = ClipGradForMOEByGlobalNorm diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py index f359ec1e0d842..43fcabf97317e 100644 --- a/python/paddle/incubate/nn/__init__.py +++ b/python/paddle/incubate/nn/__init__.py @@ -15,10 +15,11 @@ from .layer.fused_transformer import FusedMultiHeadAttention # noqa: F401 from .layer.fused_transformer import FusedFeedForward # noqa: F401 from .layer.fused_transformer import FusedTransformerEncoderLayer # noqa: F401 +from .layer.fused_transformer import FusedMultiTransformer # noqa: F401 __all__ = [ #noqa 'FusedMultiHeadAttention', 'FusedFeedForward', 'FusedTransformerEncoderLayer', - + 'FusedMultiTransformer', ] diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py index 4d1c3eee025b0..4da090487785b 100644 --- a/python/paddle/incubate/nn/functional/__init__.py +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -14,5 +14,10 @@ from .fused_transformer import fused_multi_head_attention from .fused_transformer import fused_feedforward +from .fused_transformer import fused_multi_transformer -__all__ = ['fused_multi_head_attention', 'fused_feedforward'] +__all__ = [ + 'fused_multi_head_attention', + 'fused_feedforward', + 'fused_multi_transformer', +] diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 800d5e832f1ae..3e263f1c6d3ae 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -488,3 +488,238 @@ def fused_multi_head_attention(x, 
attrs=attrs) return (final_out, cache_kv_out) if cache_kv else final_out + + +def fused_multi_transformer(x, + ln_scales, + ln_biases, + qkv_weights, + qkv_biases, + linear_weights, + linear_biases, + ffn_ln_scales, + ffn_ln_biases, + ffn1_weights, + ffn1_biases, + ffn2_weights, + ffn2_biases, + pre_layer_norm=True, + epsilon=1e-05, + cache_kvs=None, + time_step=None, + attn_mask=None, + dropout_rate=0.0, + activation="gelu", + training=False, + mode='upscale_in_train', + ring_id=-1, + name=None): + r""" + This is a fusion operator to compute multi transformer layers in transformer model architecture. + This operator only supports running on GPU. The function of the transformer layer is consistent + with the following pseudo code: + + .. code-block:: python + + if pre_layer_norm: + out = layer_norm(x) + out = qkv_linear(out) + qkv_bias + else: + out = qkv_linear(x) + qkv_bias + out = transpose(out, perm=[2, 0, 3, 1, 4]) + # extract q, k and v from out. + q = out[0:1, ::] + k = out[1:2, ::] + v = out[2:3, ::] + out = q * k^t + out = attn_mask + out + out = softmax(out) + out = dropout(out) + out = out * v + out = transpose(out, perm=[0, 2, 1, 3]) + out = linear(out) + if pre_layer_norm: + out = x + dropout(out + bias) + else: + out = layer_norm(x + dropout(out + bias)) + + residual = out; + if pre_layer_norm: + out = ffn_layer_norm(out) + out = ffn1_linear(out) + out = dropout(activation(out + ffn1_bias)) + out = ffn2_linear(out) + out = residual + dropout(out + ffn2_bias) + if not pre_layer_norm: + out = ffn_layer_norm(out) + + Args: + x (Tensor): the input tensor could be 3-D tensor, the input data type could be float16 or float32, the shape is `[batch\_size, sequence\_length, d\_model]`. + ln_scales (list(Tensor)|tuple(Tensor)): The weight tensors of attention layer_norm, the shape is `[d\_model]`. + ln_biases (list(Tensor)|tuple(Tensor)): The bias tensors of attention layer_norm. the shape is `[d\_model]`. + qkv_weights (list(Tensor)|tuple(Tensor)): The weight tensors of attention qkv computation. The shape is `[3, num\_head, dim\_head, d\_model]`. + qkv_biases (list(Tensor)|tuple(Tensor)|None): The bias tensors of attention qkv computation. The shape is `[3, num\_head, dim\_head]`. + linear_weights (list(Tensor)|tuple(Tensor)): The weight tensors of attention linear. The shape is `[num\_head * dim\_head, d\_model]`. + linear_biases (list(Tensor)|tuple(Tensor)|None): The bias tensors of attention linear. The shape is `[d\_model]`. + ffn_ln_scales (list(Tensor)|tuple(Tensor)): The weight tensors of feedforward layer_norm, the shape is `[d\_model]` + ffn_ln_biases (list(Tensor)|tuple(Tensor)): The bias tensors of feedforward layer_norm, the shape is `[d\_model]` + ffn1_weights (list(Tensor)|tuple(Tensor)): The weight tensors of feedforward first linear, the shape is `[d\_model, dim\_feedforward]`. + ffn1_biases (list(Tensor)|tuple(Tensor)|None): The bias tensors of feedforward first linear, the shape is `[dim\_feedforward]`. + ffn2_weights (list(Tensor)|tuple(Tensor)): The weight tensors of feedforward second linear, the shape is `[dim\_feedforward, d\_model]`. + ffn2_biases (list(Tensor)|tuple(Tensor)|None): The bias tensors of feedforward second linear, the shape is `[d_model]`. + pre_layer_norm (bool, optional): whether it is pre_layer_norm(True) or post_layer_norm(False). Default True. + epsilon (float, optional): Small float value added to denominator of the layer_norm to avoid dividing by zero. Default is 1e-5. 
+        cache_kvs (list(Tensor)|tuple(Tensor), optional): The cache structure tensors for the generation model. The shape is `[2, bsz, num\_head, max\_seq\_len, head\_dim]`. Default None.
+        time_step (Tensor, optional): The time step tensor for the generation model. It is used in the decode stage to represent the current time step, that is, the real seq_len of CacheKV. The shape is `[1]` and it must be placed on CPUPlace. Default None.
+        attn_mask (Tensor, optional): A tensor used in multi-head attention to prevent attention to
+            some unwanted positions, usually the paddings or the subsequent positions. It is a tensor
+            with shape `[batch_size, 1, sequence_length, sequence_length]`. Default None.
+        dropout_rate (float, optional): The dropout probability of setting units to zero. Default 0.0.
+        activation (str, optional): The activation. Default "gelu".
+        training (bool, optional): A flag indicating whether it is in the training phase or not. Default False.
+        mode (str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']
+
+            1. upscale_in_train(default), upscale the output at training time
+
+                - train: out = input * mask / ( 1.0 - p )
+                - inference: out = input
+
+            2. downscale_in_infer, downscale the output at inference
+
+                - train: out = input * mask
+                - inference: out = input * (1.0 - p)
+        ring_id (int, optional): For distributed forward in tensor model parallel, only supports NCCL. Default is -1, which means mp is not used.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor|tuple: If `cache_kvs` is None, return a tensor that has
+        the same shape and data type as `x`, representing the output
+        of Transformer layers. If `cache_kvs` is not None, return the
+        tuple (output, cache_kvs), where output is the output of the
+        Transformer layers and cache_kvs is updated in place with the input `cache_kvs`.
+
+    Examples:
+        ..
code-block:: python + + # required: gpu + import paddle + import paddle.incubate.nn.functional as F + import numpy as np + + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand(shape=(2, 4, 128), dtype="float32") + + # ln_scale: [embed_dim], ln_bias: [embed_dim] + ln_scale = paddle.rand(shape=(128,), dtype="float32") + ln_bias = paddle.rand(shape=(128,), dtype="float32") + + # qkv_weight: [3, num_head, head_dim, embed_dim], qkv_bias: [3, num_head, head_dim] + qkv_weight = paddle.rand(shape=(3, 4, 32, 128), dtype="float32") + qkv_bias = paddle.rand(shape=(3, 4, 32), dtype="float32") + + # linear_weight: [embed_dim, embed_dim], linear_bias: [embed_dim] + linear_weight = paddle.rand(shape=(128, 128), dtype="float32") + linear_bias = paddle.rand(shape=(128,), dtype="float32") + + # ffn_ln_scale: [embed_dim], ffn_ln_bias: [embed_dim] + ffn_ln_scale = paddle.rand(shape=(128,), dtype="float32") + ffn_ln_bias = paddle.rand(shape=(128,), dtype="float32") + + # ffn1_weight: [embed_dim, 4*embed_dim], ffn1_bias: [4*embed_dim] + ffn1_weight = paddle.rand(shape=(128, 4*128), dtype="float32") + ffn1_bias = paddle.rand(shape=(4*128,), dtype="float32") + + # ffn2_weight: [4*embed_dim, embed_dim], ffn2_bias: [embed_dim] + ffn2_weight = paddle.rand(shape=(4*128, 128), dtype="float32") + ffn2_bias = paddle.rand(shape=(128,), dtype="float32") + + # self attention mask: [batch_size, 1, seq_len, seq_len] + attn_mask = paddle.rand(shape=(2, 1, 4, 4), dtype="float32") + + # output: [batch_size, seq_len, embed_dim] + output = F.fused_multi_transformer( + x, [ln_scale], [ln_bias], [qkv_weight], [qkv_bias], + [linear_weight], [linear_bias], [ffn_ln_scale], [ffn_ln_bias], + [ffn1_weight], [ffn1_bias], [ffn2_weight], [ffn2_bias], + attn_mask=attn_mask) + # [2, 4, 128] + print(output.shape) + """ + if mode not in ('downscale_in_infer', 'upscale_in_train'): + raise ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + if _non_static_mode(): + cache_kv_out, final_out = _C_ops.fused_multi_transformer( + x, ln_scales, ln_biases, qkv_weights, qkv_biases, cache_kvs, + time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales, + ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, + cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, + 'dropout_rate', dropout_rate, 'dropout_is_test', not training, + 'dropout_implementation', mode, 'act_method', activation, 'ring_id', + ring_id) + if cache_kvs is not None: + return final_out, cache_kv_out + return final_out + else: + helper = LayerHelper('fused_multi_transformer', **locals()) + dtype = x.dtype + # check dtypes + check_variable_and_dtype(x, 'x', ['float16', 'float32'], + 'fused_multi_transformer') + check_dtype(dtype, 'dtype', ['float16', 'float32'], + 'fused_multi_transformer') + + # set inputs + inputs = dict() + inputs['X'] = [x] + inputs['LnScale'] = ln_scales + inputs['LnBias'] = ln_biases + inputs['QKVW'] = qkv_weights + if qkv_biases is not None: + inputs['QKVBias'] = qkv_biases + if cache_kvs is not None: + assert len(cache_kvs) == len(qkv_weights) + inputs['CacheKV'] = cache_kvs + if time_step is not None: + inputs['TimeStep'] = time_step + inputs['SrcMask'] = attn_mask + inputs['OutLinearW'] = linear_weights + if linear_biases is not None: + inputs['OutLinearBias'] = linear_biases + + inputs['FFNLnScale'] = ffn_ln_scales + inputs['FFNLnBias'] = ffn_ln_biases + inputs['FFN1Weight'] = ffn1_weights + if 
ffn1_biases is not None: + inputs['FFN1Bias'] = ffn1_biases + inputs['FFN2Weight'] = ffn2_weights + if ffn2_biases is not None: + inputs['FFN2Bias'] = ffn2_biases + + # set attrs + attrs = { + 'pre_layer_norm': pre_layer_norm, + 'epsilon': epsilon, + 'dropout_rate': dropout_rate, + 'dropout_is_test': not training, + 'dropout_implementation': mode, + 'act_method': activation, + 'ring_id': ring_id + } + + outputs = dict() + final_out = helper.create_variable_for_type_inference(dtype=dtype) + outputs['Out'] = final_out + if cache_kvs: + # NOTE: inplace + outputs['CacheKVOut'] = cache_kvs + + helper.append_op( + type='fused_multi_transformer', + inputs=inputs, + outputs=outputs, + attrs=attrs) + + return (final_out, cache_kvs) if cache_kvs else final_out diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index d38e8d1193bef..d76b990958c94 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -22,6 +22,20 @@ import collections +# for distributed tensor model parallel +def _set_var_distributed(var): + if var is None: + return + + var.is_distributed = True + + # NOTE: use current_block and find_var_recursive to support while_loop + startup_block = paddle.static.default_startup_program().current_block() + main_block = paddle.static.default_main_program().current_block() + startup_block._find_var_recursive(var.name).is_distributed = True + main_block._find_var_recursive(var.name).is_distributed = True + + class FusedMultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and @@ -608,3 +622,390 @@ def __init__(self, def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): raise NotImplementedError() + + +class FusedMultiTransformer(Layer): + """ + FusedMultiTransformer is composed of multi transformer layers which contains two + sub-layers which are self (multi-head) attention and feedforward network. The + function of one transformer layer is consistent with the following pseudo code: + + .. code-block:: python + + if pre_layer_norm: + out = layer_norm(x) + out = qkv_linear(out) + qkv_bias + else: + out = qkv_linear(x) + qkv_bias + out = transpose(out, perm=[2, 0, 3, 1, 4]) + # extract q, k and v from out. + q = out[0:1, ::] + k = out[1:2, ::] + v = out[2:3, ::] + out = q * k^t + out = attn_mask + out + out = softmax(out) + out = dropout(out) + out = out * v + out = transpose(out, perm=[0, 2, 1, 3]) + out = linear(out) + if pre_layer_norm: + out = x + dropout(out + bias) + else: + out = layer_norm(x + dropout(out + bias)) + + residual = out; + if pre_layer_norm: + out = ffn_layer_norm(out) + out = ffn1_linear(out) + out = dropout(activation(out + ffn1_bias)) + out = ffn2_linear(out) + out = residual + dropout(out + ffn2_bias) + if not pre_layer_norm: + out = ffn_layer_norm(out) + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout_rate (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.0 + activation (str, optional): The activation function in the feedforward + network. Default "gelu". + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. 
If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default True + ln_scale_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property + for Attention layer_norm. For Attention layer_norm weight, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property + for Attention layer_norm. For Attention layer_norm bias, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + qkv_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property + for Attention qkv computation. For Attention qkv weight, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + qkv_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property + for Attention qkv computation. For Attention qkv bias, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + linear_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property + for Attention linear. For Attention linear weight, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property + for Attention linear computation. For Attention linear bias, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. 
+ ffn_ln_scale_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property + for FFN layer_norm. For FFN layer_norm weight, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ffn_ln_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property + for FFN layer_norm. For FFN layer_norm bias, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ffn1_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property + for FFN first linear. For FFN first linear weight, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ffn1_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property + for FFN first linear. For FFN first linear bias, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ffn2_weight_attrs(ParamAttr|list|tuple, optional): To specify the weight parameter property + for FFN second linear. For FFN second linear weight, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ffn2_bias_attrs(ParamAttr|list|tuple|bool, optional): To specify the bias parameter property + for FFN second linear. For FFN second linear bias, if it is a list/tuple, `attrs[0]` + would be used as `attr` for transformer layer 0, and `attrs[1]` would be used as + `attr` for transformer layer 1,etc. Otherwise, all layers both use it as + `attr` to create parameters. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + epsilon (float, optional): Small float value added to denominator of the layer_norm to + avoid dividing by zero. Default: 1e-05. + num_layers (int, optional): The number of layers of the transformer. If `qkv_weight_attrs` + is a list or tuple, the number of layers is obtained from `qkv_weight_attrs`. 
num_layers + only takes effect when `qkv_weight_attrs` is not a list or tuple. Default: -1. + nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using mp. + ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using mp. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Examples: + + .. code-block:: python + + # required: gpu + import paddle + from paddle.incubate.nn import FusedMultiTransformer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, 1, src_len, src_len] + attn_mask = paddle.rand((2, 1, 4, 4)) + encoder_layers = FusedMultiTransformer(128, 2, 512, num_layers=1) + enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dim_feedforward, + dropout_rate=0.0, + activation="gelu", + normalize_before=True, + ln_scale_attrs=None, + ln_bias_attrs=None, + qkv_weight_attrs=None, + qkv_bias_attrs=None, + linear_weight_attrs=None, + linear_bias_attrs=None, + ffn_ln_scale_attrs=None, + ffn_ln_bias_attrs=None, + ffn1_weight_attrs=None, + ffn1_bias_attrs=None, + ffn2_weight_attrs=None, + ffn2_bias_attrs=None, + epsilon=1e-5, + num_layers=-1, + nranks=1, + ring_id=-1, + name=None): + super(FusedMultiTransformer, self).__init__() + + assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " + "but recieved {}".format(embed_dim)) + assert num_heads > 0, ("Expected nhead to be greater than 0, " + "but recieved {}".format(num_heads)) + assert dim_feedforward > 0, ( + "Expected dim_feedforward to be greater than 0, but recieved {}". 
+ format(dim_feedforward)) + + self.normalize_before = normalize_before + self._dtype = self._helper.get_default_dtype() + self._epsilon = epsilon + self._ring_id = ring_id + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + + # tensor model parallel + if nranks > 1: + assert ring_id != -1 + assert num_heads % nranks == 0 + assert dim_feedforward % nranks == 0 + num_heads = num_heads // nranks + dim_feedforward = dim_feedforward // nranks + self._dim_feedforward = dim_feedforward + + if isinstance(qkv_weight_attrs, (list, tuple)): + num_layers = len(qkv_weight_attrs) + assert num_layers > 0 + + self.ln_scales, self.ln_biases = [], [] + self.qkv_weights, self.qkv_biases = [], [] + self.linear_weights, self.linear_biases = [], [] + self.ffn_ln_scales, self.ffn_ln_biases = [], [] + self.ffn1_weights, self.ffn1_biases = [], [] + self.ffn2_weights, self.ffn2_biases = [], [] + + def get_attr(attrs, idx): + if isinstance(attrs, (list, tuple)): + assert len(attrs) == num_layers + return attrs[idx] + return attrs + + for i in range(num_layers): + ln_scale_attr = get_attr(ln_scale_attrs, i) + ln_bias_attr = get_attr(ln_bias_attrs, i) + qkv_weight_attr = get_attr(qkv_weight_attrs, i) + qkv_bias_attr = get_attr(qkv_bias_attrs, i) + linear_weight_attr = get_attr(linear_weight_attrs, i) + linear_bias_attr = get_attr(linear_bias_attrs, i) + + ffn_ln_scale_attr = get_attr(ffn_ln_scale_attrs, i) + ffn_ln_bias_attr = get_attr(ffn_ln_bias_attrs, i) + ffn1_weight_attr = get_attr(ffn1_weight_attrs, i) + ffn1_bias_attr = get_attr(ffn1_bias_attrs, i) + ffn2_weight_attr = get_attr(ffn2_weight_attrs, i) + ffn2_bias_attr = get_attr(ffn2_bias_attrs, i) + + ln_scale = self.create_parameter( + attr=ln_scale_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + ln_bias = self.create_parameter( + attr=ln_bias_attr, shape=[embed_dim], is_bias=True) + qkv_weight = self.create_parameter( + shape=[3, num_heads, self.head_dim, embed_dim], + attr=qkv_weight_attr, + dtype=self._dtype, + is_bias=False) + qkv_bias = self.create_parameter( + shape=[3, num_heads, self.head_dim], + attr=qkv_bias_attr, + dtype=self._dtype, + is_bias=True) + linear_weight = self.create_parameter( + shape=[num_heads * self.head_dim, embed_dim], + attr=linear_weight_attr, + dtype=self._dtype, + is_bias=False) + linear_bias = self.create_parameter( + shape=[embed_dim], + attr=linear_bias_attr, + dtype=self._dtype, + is_bias=True) + + ffn_ln_scale = self.create_parameter( + shape=[embed_dim], + attr=ffn_ln_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + ffn_ln_bias = self.create_parameter( + shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True) + ffn1_weight = self.create_parameter( + shape=[embed_dim, dim_feedforward], + attr=ffn1_weight_attr, + dtype=self._dtype, + is_bias=False) + ffn1_bias = self.create_parameter( + shape=[dim_feedforward], + attr=ffn1_bias_attr, + dtype=self._dtype, + is_bias=True) + ffn2_weight = self.create_parameter( + shape=[dim_feedforward, embed_dim], + attr=ffn2_weight_attr, + dtype=self._dtype, + is_bias=False) + ffn2_bias = self.create_parameter( + shape=[embed_dim], + attr=ffn2_bias_attr, + dtype=self._dtype, + is_bias=True) + + # tensor model parallel + if nranks > 1: + # column parallel + _set_var_distributed(qkv_weight) + _set_var_distributed(qkv_bias) + _set_var_distributed(ffn1_weight) + _set_var_distributed(ffn1_bias) + # row parallel + 
_set_var_distributed(linear_weight) + _set_var_distributed(ffn2_weight) + + self.ln_scales.append(ln_scale) + self.ln_biases.append(ln_bias) + self.qkv_weights.append(qkv_weight) + self.qkv_biases.append(qkv_bias) + self.linear_weights.append(linear_weight) + self.linear_biases.append(linear_bias) + + self.ffn_ln_scales.append(ffn_ln_scale) + self.ffn_ln_biases.append(ffn_ln_bias) + self.ffn1_weights.append(ffn1_weight) + self.ffn1_biases.append(ffn1_bias) + self.ffn2_weights.append(ffn2_weight) + self.ffn2_biases.append(ffn2_bias) + + self.dropout_rate = dropout_rate + self.activation = activation + self.name = name + + def forward(self, src, attn_mask=None, caches=None, time_step=None): + """ + Applies multi transformer layers on the input. + + Parameters: + src (Tensor): The input of Transformer layers. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float16 or float32. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + `[batch_size, 1, sequence_length, sequence_length]`. It can be + None when nothing wanted or needed to be prevented attention to. + Default None. + caches (list(Tensor)|tuple(Tensor), optional): The cache structure + tensors for the inference generation model. It is only used for + inference and should be None for training. The shape is + `[2, batch_size, num_head, max_seq_len, head_dim]`. Default None. + time_step (Tensor, optional): The time step tensor for the generation + model. Which used in decode stage, to represent the time step, + that is, the real seq_len of CacheKV. The shape is `[1]`, must be + in CPUPlace. Default None. + + Returns: + Tensor|tuple: If `caches` is None, return a tensor that has + the same shape and data type with `src`, representing the output + of Transformer layers. If `caches` is not None, return the + tuple (output, caches), which output is the output of + Transformer layers, caches is inplace with input `caches`. 
+ """ + + if caches is not None: + assert len(caches) == len(self.qkv_weights) + out = incubate_f.fused_multi_transformer( + src, + self.ln_scales, + self.ln_biases, + self.qkv_weights, + self.qkv_biases, + self.linear_weights, + self.linear_biases, + self.ffn_ln_scales, + self.ffn_ln_biases, + self.ffn1_weights, + self.ffn1_biases, + self.ffn2_weights, + self.ffn2_biases, + pre_layer_norm=self.normalize_before, + epsilon=self._epsilon, + cache_kvs=caches, + time_step=time_step, + attn_mask=attn_mask, + dropout_rate=self.dropout_rate, + activation=self.activation, + training=self.training, + mode='upscale_in_train', + ring_id=self._ring_id, + name=self.name) + return out diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index cba1d4863cbd4..4ddcfbac8791f 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -34,7 +34,6 @@ from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.param_attr import ParamAttr from paddle import _C_ops -__all__ = ['resnet_unit', 'ResNetUnit'] def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z, diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 74b5398230dee..4d40a477ffc07 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -38,6 +38,7 @@ def __init__(self, is_grad_scaled_by_nranks=True, alignment=128, use_master_param_norm=True, + gradient_accumulation_steps=1, name=None): assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" @@ -63,6 +64,9 @@ def __init__(self, self._scale = None self._ring_id = 0 self._use_master_param_norm = use_master_param_norm + self._gradient_accumulation_steps = gradient_accumulation_steps + assert self._gradient_accumulation_steps >= 1 + self.helper = LayerHelper('distributed_fused_lamb') self._supports_check_nan_inf = True # very import flag for AMP @@ -73,8 +77,19 @@ def __init__(self, dtype=core.VarDesc.VarType.BOOL) self._step = None + if self._gradient_accumulation_steps > 1: + self._stop_update = main_block.create_var( + name=unique_name.generate('stop_update'), + shape=[1], + dtype=core.VarDesc.VarType.BOOL) + else: + self._stop_update = None + self._param_to_master_param = {} + def _get_stop_update_var(self): + return self._stop_update if self._stop_update is not None else False + def _set_step(self, step): self._step = step @@ -194,6 +209,20 @@ def _apply_gradients_impl(self, params_grads): param_order = self._create_persistable_var('param_order', dtype='int32') param_order.is_distributed = True + if self._gradient_accumulation_steps > 1: + fp32_acc_fused_grad = [ + self._create_persistable_var('fp32_acc_fused_grad') + ] + fp16_acc_fused_grad = [ + self._create_persistable_var( + 'fp16_acc_fused_grad', dtype='float16') + ] + acc_step = [self._create_persistable_var('acc_step', dtype='int64')] + else: + fp32_acc_fused_grad = [] + fp16_acc_fused_grad = [] + acc_step = [] + step = self._get_or_create_step() rank = get_rank() @@ -298,6 +327,11 @@ def _apply_gradients_impl(self, params_grads): 'ParamOut': params, 'GradOut': grads, 'FoundInf': [self._found_inf], + 'FP32AccFusedGrad': fp32_acc_fused_grad, + 'FP16AccFusedGrad': fp16_acc_fused_grad, + 'AccStep': acc_step, + 'StopUpdate': self._stop_update + if self._stop_update is not None else [], 'Step': [step], }, 
attrs={ @@ -311,5 +345,6 @@ def _apply_gradients_impl(self, params_grads): 'ring_id': self._ring_id, 'use_master_param_norm': self._use_master_param_norm, 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks, + 'acc_steps': self._gradient_accumulation_steps, }) return [lamb_op] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 70e3518a1af46..bceee4b964a33 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -138,6 +138,7 @@ from .layer.distance import PairwiseDistance # noqa: F401 from .layer.vision import PixelShuffle # noqa: F401 +from .layer.vision import PixelUnshuffle # noqa: F401 from .layer.vision import ChannelShuffle # noqa: F401 from .layer.container import LayerDict # noqa: F401 @@ -301,6 +302,7 @@ def weight_norm(*args): 'Swish', 'Mish', 'PixelShuffle', + 'PixelUnshuffle', 'ChannelShuffle', 'ELU', 'ReLU6', diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 58251c2890430..68213d831c550 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -114,6 +114,7 @@ from .vision import affine_grid # noqa: F401 from .vision import grid_sample # noqa: F401 from .vision import pixel_shuffle # noqa: F401 +from .vision import pixel_unshuffle # noqa: F401 from .vision import channel_shuffle # noqa: F401 from .input import one_hot # noqa: F401 from .input import embedding # noqa: F401 @@ -214,6 +215,7 @@ 'grid_sample', 'local_response_norm', 'pixel_shuffle', + 'pixel_unshuffle', 'channel_shuffle', 'embedding', 'gather_tree', diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 34acbfbf75463..e64efda7b33bf 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -112,7 +112,10 @@ def elu(x, alpha=1.0, name=None): # [ 1. 15.6 ]] """ - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_elu(x, alpha) + + if _in_legacy_dygraph(): return _C_ops.elu(x, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 907fd4e6252c6..fe37b8fb97c3d 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1633,14 +1633,14 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): #[[[0.03333334 0.93333334 0.03333334] # [0.93333334 0.03333334 0.93333334]]] """ + if epsilon > 1. or epsilon < 0.: + raise ValueError("The value of epsilon must be between 0 and 1.") + if in_dygraph_mode(): return _C_ops.final_state_label_smooth(label, prior_dist, float(epsilon)) - if epsilon > 1. or epsilon < 0.: - raise ValueError("The value of epsilon must be between 0 and 1.") - - if paddle.in_dynamic_mode(): + elif paddle.in_dynamic_mode(): return _C_ops.label_smooth(label, prior_dist, 'epsilon', float(epsilon)) check_variable_and_dtype(label, 'label', ['float32', 'float64'], diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 07e68d71dc1f1..9a9c2ee4cf7d1 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -347,6 +347,64 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): return out +def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): + """ + This API implements pixel unshuffle operation. + See more details in :ref:`api_nn_vision_PixelUnshuffle` . 
+ + Parameters: + x (Tensor): 4-D tensor, the data type should be float32 or float64. + downscale_factor (int): Factor to decrease spatial resolution. + data_format (str): The data format of the input and output data. An optional string of NCHW or NHWC. The default is NCHW. When it is NCHW, the data is stored in the order of [batch_size, input_channels, input_height, input_width]. + name (str, optional): Name for the operation (optional, default is None). Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Out (Tensor): Reshaped tensor according to the new dimension. + + Examples: + .. code-block:: python + :name: pixel_unshuffle-example + + import paddle + import paddle.nn.functional as F + x = paddle.randn([2, 1, 12, 12]) + out = F.pixel_unshuffle(x, 3) + # out.shape = [2, 9, 4, 4] + """ + if len(x.shape) != 4: + raise ValueError( + "Input x should be 4D tensor, but received x with the shape of {}". + format(x.shape)) + + if not isinstance(downscale_factor, int): + raise TypeError("Downscale factor must be int type") + + if downscale_factor <= 0: + raise ValueError("Downscale factor must be positive") + + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'." + "But recevie Attr(data_format): {} ".format( + data_format)) + + if _non_static_mode(): + return _C_ops.pixel_unshuffle(x, "downscale_factor", downscale_factor, + "data_format", data_format) + + helper = LayerHelper("pixel_unshuffle", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_unshuffle') + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="pixel_unshuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={ + "downscale_factor": downscale_factor, + "data_format": data_format + }) + return out + + def channel_shuffle(x, groups, data_format="NCHW", name=None): """ This API implements channel shuffle operation. 
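Note on the pixel_unshuffle op added above: it is intended as the inverse of pixel_shuffle, so for an NCHW input it should be reproducible with a plain reshape/transpose. The snippet below is only an illustrative sketch of that equivalence, not part of this patch; it assumes the channel ordering mirrors the inverse of pixel_shuffle.

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([2, 1, 12, 12])   # [N, C, H, W]
    r = 3                              # downscale_factor
    n, c, h, w = x.shape

    # reference: fold each r x r spatial block into the channel dimension
    ref = x.reshape([n, c, h // r, r, w // r, r])       # [N, C, H/r, r, W/r, r]
    ref = ref.transpose([0, 1, 3, 5, 2, 4])             # [N, C, r, r, H/r, W/r]
    ref = ref.reshape([n, c * r * r, h // r, w // r])   # [N, r*r*C, H/r, W/r]

    out = F.pixel_unshuffle(x, r)                       # [2, 9, 4, 4]
    # out and ref should match element-wise if the assumed ordering holds
    print(paddle.allclose(out, ref))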
diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index c7cb1052d2f78..9c84b01ecb9af 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -168,14 +168,22 @@ def __call__(self, var, block=None): idx_list.append(offset) if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): - tmp_out = _C_ops.reshape(out_var, 'shape', [-1]) + tmp_out, _ = _C_ops.reshape2(out_var, None, 'shape', [-1]) tmp_out._share_underline_tensor_to(out_var) else: + x_shape = block.create_var( + name=unique_name.generate(".".join([out_var.name, "XShape"])), + dtype=out_var.dtype, + shape=out_var.shape, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True) block.append_op( - type="reshape", + type="reshape2", inputs={"X": out_var}, attrs={'shape': [-1]}, - outputs={"Out": out_var}, + outputs={"Out": out_var, + "XShape": x_shape}, stop_gradient=True) index_tensor = block.create_var( @@ -229,7 +237,8 @@ def __call__(self, var, block=None): tmp_out = _C_ops.final_state_scatter(out_var, index_tensor, value_tensor, True) tmp_out._share_underline_tensor_to(out_var) - tmp_reshape_out = _C_ops.reshape(out_var, 'shape', origin_shape) + tmp_reshape_out, _ = _C_ops.reshape2(out_var, None, 'shape', + origin_shape) tmp_reshape_out._share_underline_tensor_to(out_var) if var.dtype != VarDesc.VarType.FP32: tmp_cast_out = _C_ops.cast(out_var, 'in_dtype', @@ -248,11 +257,19 @@ def __call__(self, var, block=None): attrs={'overwrite': True}, outputs={"Out": out_var}, stop_gradient=True) + x_shape = block.create_var( + name=unique_name.generate(".".join([out_var.name, "XShape"])), + dtype=out_var.dtype, + shape=out_var.shape, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True) block.append_op( - type="reshape", + type="reshape2", inputs={"X": out_var}, attrs={'shape': origin_shape}, - outputs={"Out": out_var}, + outputs={"Out": out_var, + "XShape": x_shape}, stop_gradient=True) if var.dtype != VarDesc.VarType.FP32: block.append_op( diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 339feef8f32e6..31364f0281c8a 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -88,6 +88,7 @@ from .norm import LocalResponseNorm # noqa: F401 from .vision import PixelShuffle # noqa: F401 +from .vision import PixelUnshuffle # noqa: F401 from .vision import ChannelShuffle # noqa: F401 from .distance import PairwiseDistance # noqa: F401 from .container import LayerDict # noqa: F401 diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b5daa290456e3..ae6e37a02751d 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -37,6 +37,10 @@ from paddle.framework import core from paddle.static import default_startup_program from paddle.static import program_guard +try: + from collections.abc import Sequence +except: + from collections import Sequence __all__ = [] @@ -197,7 +201,7 @@ def _is_shape_sequence(seq): # TODO: Add check for the illegal if isinstance(seq, dict): return True - return (isinstance(seq, collections.Sequence) and + return (isinstance(seq, Sequence) and not isinstance(seq, six.string_types)) class Shape(object): diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index e775d4fcf6dfb..6d5c112d75703 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -89,6 +89,69 @@ def extra_repr(self): return main_str +class 
PixelUnshuffle(Layer):
+    """
+    This operator rearranges elements in a tensor of shape :math:`[N, C, H, W]`
+    to a tensor of shape :math:`[N, r^2C, H/r, W/r]`, or from shape
+    :math:`[N, H, W, C]` to :math:`[N, H/r, W/r, r^2C]`, where :math:`r` is the
+    downscale factor. This operation is the inverse of the PixelShuffle operation.
+    Please refer to the paper: `Real-Time Single Image and Video Super-Resolution
+    Using an Efficient Sub-Pixel Convolutional Neural Network `_
+    by Shi et al. (2016) for more details.
+
+    Parameters:
+        downscale_factor (int): Factor to decrease spatial resolution.
+        data_format (str): The data format of the input and output data. An optional string of NCHW or NHWC. The default is NCHW. When it is NCHW, the data is stored in the order of [batch_size, input_channels, input_height, input_width].
+        name (str, optional): Name for the operation (optional, default is None). Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - **x**: 4-D tensor with shape of :math:`[N, C, H, W]` or :math:`[N, H, W, C]`.
+        - **out**: 4-D tensor with shape of :math:`[N, r^2C, H/r, W/r]` or :math:`[N, H/r, W/r, r^2C]`, where :math:`r` is :attr:`downscale_factor`.
+
+    Examples:
+        .. code-block:: python
+            :name: PixelUnshuffle-example
+
+            import paddle
+            import paddle.nn as nn
+
+            x = paddle.randn([2, 1, 12, 12])
+            pixel_unshuffle = nn.PixelUnshuffle(3)
+            out = pixel_unshuffle(x)
+            # out.shape = [2, 9, 4, 4]
+
+    """
+
+    def __init__(self, downscale_factor, data_format="NCHW", name=None):
+        super(PixelUnshuffle, self).__init__()
+
+        if not isinstance(downscale_factor, int):
+            raise TypeError("Downscale factor must be int type")
+
+        if downscale_factor <= 0:
+            raise ValueError("Downscale factor must be positive")
+
+        if data_format not in ["NCHW", "NHWC"]:
+            raise ValueError("Data format should be 'NCHW' or 'NHWC'."
+ "But recevie data format: {}".format(data_format)) + + self._downscale_factor = downscale_factor + self._data_format = data_format + self._name = name + + def forward(self, x): + return functional.pixel_unshuffle(x, self._downscale_factor, + self._data_format, self._name) + + def extra_repr(self): + main_str = 'downscale_factor={}'.format(self._downscale_factor) + if self._data_format != 'NCHW': + main_str += ', data_format={}'.format(self._data_format) + if self._name is not None: + main_str += ', name={}'.format(self._name) + return main_str + + class ChannelShuffle(Layer): """ This operator divides channels in a tensor of shape [N, C, H, W] or [N, H, W, C] into g groups, diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index 23ee0c5014aed..93653e09c9019 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -20,7 +20,9 @@ from .layer.conv import Conv3D from .layer.conv import SubmConv3D +from .layer.pooling import MaxPool3D + __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm' + 'BatchNorm', 'MaxPool3D' ] diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/sparse/functional/__init__.py index 93c3ccda4a613..f1ca4cc6fcc48 100644 --- a/python/paddle/sparse/functional/__init__.py +++ b/python/paddle/sparse/functional/__init__.py @@ -15,5 +15,6 @@ from .activation import relu # noqa: F401 from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 +from .pooling import max_pool3d # noqa: F401 -__all__ = ['relu', 'conv3d', 'subm_conv3d'] +__all__ = ['relu', 'conv3d', 'subm_conv3d', 'max_pool3d'] diff --git a/python/paddle/sparse/functional/pooling.py b/python/paddle/sparse/functional/pooling.py new file mode 100644 index 0000000000000..ab5106b31689d --- /dev/null +++ b/python/paddle/sparse/functional/pooling.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.layers import utils +from paddle import _C_ops, in_dynamic_mode +from paddle.nn.functional.pooling import _update_padding_nd + +__all__ = [] + + +def max_pool3d(x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + data_format="NDHWC", + name=None): + """ + Implements sparse max pooling 3d operation. + See more details in :ref:`api_sparse_pooling_MaxPool3d` . + + Args: + x (Tensor): The input SparseCooTensor of pooling operator, which is a 5-D tensor with + shape [N, D, H, W, C]. The format of input tensor `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. + kernel_size (int|list|tuple): The pool kernel size. If the kernel size + is a tuple or list, it must contain three integers, + (kernel_size_Depth, kernel_size_Height, kernel_size_Width). + Otherwise, the pool kernel size will be the cube of an int. + stride (int|list|tuple): The pool stride size. 
If pool stride size is a tuple or list,
+            it must contain three integers, (stride_Depth, stride_Height, stride_Width).
+            Otherwise, the pool stride size will be a cube of an int.
+        padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width], whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 6, [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right], whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode (bool): ${ceil_mode_comment}
+        data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
+            The default is `"NDHWC"`. When it is `"NCDHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_depth, input_height, input_width]`. Currently only `"NDHWC"` is supported.
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name does not need to be set and
+            is None by default.
+
+    Returns:
+        Tensor: The output tensor of pooling result. The data type is the same as the input tensor.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          from paddle.fluid.framework import _test_eager_guard
+
+          with _test_eager_guard():
+              dense_x = paddle.randn((1, 4, 4, 4, 3))
+              sparse_x = dense_x.to_sparse_coo(4)
+              kernel_sizes = [3, 3, 3]
+              paddings = [0, 0, 0]
+              strides = [1, 1, 1]
+              out = paddle.sparse.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings)
+              #[1, 2, 2, 2, 3]
+    """
+
+    assert in_dynamic_mode(), "Currently, Sparse API only supports dynamic mode"
+    assert x.is_sparse_coo(
+    ), "Currently, sparse.max_pool3d only supports the input of SparseCooTensor"
+    assert data_format == 'NDHWC', "Currently, sparse.max_pool3d only supports data format of 'NDHWC'"
+
+    kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size')
+    if stride is None:
+        stride = kernel_size
+    else:
+        stride = utils.convert_to_list(stride, 3, 'pool_stride')
+
+    channel_last = True
+
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
+
+    #TODO(zkh2016): remove the dependency on dilation from the backend
+    dilation = [1, 1, 1]
+
+    return _C_ops.final_state_sparse_maxpool(x, kernel_size, padding, dilation,
+                                             stride)
diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py
index ee32e5027b50f..3a6d99392e4e8 100644
--- a/python/paddle/sparse/layer/__init__.py
+++ b/python/paddle/sparse/layer/__init__.py
@@ -16,5 +16,6 @@
 from .norm import BatchNorm
 from .conv import Conv3D
 from .conv import SubmConv3D
+from .pooling import MaxPool3D
 
 __all__ = []
diff --git a/python/paddle/sparse/layer/pooling.py b/python/paddle/sparse/layer/pooling.py
new file mode 100644
index 0000000000000..9cfe463eed577
--- /dev/null
+++ b/python/paddle/sparse/layer/pooling.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.nn import Layer
+from .. import functional as F
+
+
+class MaxPool3D(Layer):
+    """
+    This operation applies 3D max pooling over input features based on the sparse input,
+    and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
+    in NDHWC format, where N is batch size, C is the number of channels,
+    H is the height of the feature, D is the depth of the feature, and W is the width of the feature.
+
+    Parameters:
+        kernel_size(int|list|tuple): The pool kernel size. If the kernel size
+            is a tuple or list, it must contain three integers,
+            (kernel_size_Depth, kernel_size_Height, kernel_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list,
+            it must contain three integers, (stride_Depth, stride_Height, stride_Width).
+            Otherwise, the pool stride size will be a cube of an int.
+            Default None, then stride will be equal to the kernel_size.
+        padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms.
+            1. A string in ['valid', 'same'].
+            2. An int, which means the feature map is zero padded by size of `padding` on every side.
+            3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_width], whose value means the padding size of each dimension.
+            4. A list[int] or tuple(int) whose length is 6, [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right], whose value means the padding size of each side.
+            5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
+            The default value is 0.
+        ceil_mode(bool, optional): ${ceil_mode_comment}
+        return_mask(bool, optional): Whether to return the max indices along with the outputs.
+        data_format(str, optional): The data format of the input and output data. An optional string from: `"NCDHW"`,
+            `"NDHWC"`. The default is `"NDHWC"`. When it is `"NCDHW"`, the data is stored in the order of:
+            `[batch_size, input_channels, input_depth, input_height, input_width]`. Currently, only "NDHWC" is supported.
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually name does not need to be set and is None by default.
+
+
+    Returns:
+        A callable object of MaxPool3D.
+
+    Shape:
+        - x(Tensor): The input SparseCooTensor of max pool3d operator, which is a 5-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of max pool3d operator, which is a 5-D tensor.
+          The data type is the same as input x.
+
+    Examples:
+        ..
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.randn((2, 3, 6, 6, 3)) + sparse_x = dense_x.to_sparse_coo(4) + max_pool3d = paddle.sparse.MaxPool3D( + kernel_size=3, data_format='NDHWC') + out = max_pool3d(sparse_x) + #shape=[2, 1, 2, 2, 3] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + data_format="NDHWC", + name=None): + super(MaxPool3D, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.return_mask = return_mask + self.ceil_mode = ceil_mode + self.data_format = data_format + self.name = name + + def forward(self, x): + return F.max_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + data_format=self.data_format, + name=self.name) + + def extra_repr(self): + return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format( + **self.__dict__) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index aeec256bc1580..a5a4df6571b77 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -827,6 +827,11 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): end = start start = 0 + out_shape = None + if not isinstance(start, Variable) and not isinstance( + end, Variable) and not isinstance(step, Variable): + out_shape = [int(math.ceil((end - start) / step))] + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -857,11 +862,6 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): out.stop_gradient = True return out - out_shape = None - if not isinstance(start, Variable) and not isinstance( - end, Variable) and not isinstance(step, Variable): - out_shape = [int(math.ceil((end - start) / step))] - check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'range/arange') helper = LayerHelper('range', **locals()) @@ -873,6 +873,8 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): 'Step': step}, outputs={'Out': out}) out.stop_gradient = True + if out_shape is not None: + out.desc.set_shape(out_shape) return out diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 06c2a82fd696d..713a611f9f39a 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -20,6 +20,14 @@ from .manipulation import squeeze, unsqueeze, reshape from .math import multiply from .math import sum as paddle_sum +from ..fluid.framework import _in_legacy_dygraph +from paddle import _C_ops +from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from ..fluid.layer_helper import LayerHelper +from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph +import collections +import string +import opt_einsum from paddle.common_ops_import import dygraph_only @@ -660,6 +668,157 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan +def preprocess(equation, *operands): + """ + check equation / raise error, default right labels generation + """ + equation = equation.replace(" ", "") + nop = len(operands) + assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop + + # Part the equation to left hand side and right hand side + lhs, *rhs = equation.lower().split('->') + assert len(rhs) < 2, "Invalid equation: multiple `->` were found." 
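+    # Illustrative note (editor's sketch, not part of the original patch logic):
+    # for equation = "ij,jk->ik", lhs is "ij,jk" and rhs is ["ik"];
+    # for equation = "ij,jk" (no "->"), rhs is [] and the output labels are
+    # inferred later from the free labels via rhs_inference(lhs).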
+ + labels = parse_labels(lhs, operands) + # Note, we distinguish between 'ij->' and 'ij' by setting rhs to '' and None + rhs = rhs[0] if rhs else None + if rhs is None: + rhs = rhs_inference(lhs) + + assert len(lhs.split(',')) == len(operands), ( + f"Invalid equation: the number of operands is {len(operands)}, " + f"but found {len(lhs.split(','))} segments in the label equation.") + + assert not ('...' in lhs and '...' not in rhs + ), f'Invalid equation: missing ellipsis in output labels.' + + assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0 + ), f'Duplicate labels are not supported.' + + assert not has_duplicated_labels( + rhs), f'Invalid equation: duplicate output labels are found.' + + return lhs, rhs, labels + + +def parse_fake_shape(equation, operands, labels): + """ + this shape is just used for operands planning. may differ with the original shape. + for example: + ... is replaced by 1 + -1 is replaced by 1 + Results + ------- + list of shape + """ + shaped = collections.namedtuple('shaped', ['shape']) + + def fake_shape(label, op): + assert len(op.shape) == len( + label + ), "length of shape and length of label must be the same, but received %d != %d" % ( + len(op.shape), len(label)) + fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.'] + fakes = list(map(abs, fakes)) # make -1 -> 1 + if '.' in label: + fakes.insert(label.index('.'), 1) + return shaped(fakes) + + out = list(map(fake_shape, labels, operands)) + return out + + +def rhs_inference(lhs): + def is_free(key): + return cnt.get(key) == 1 and key not in ['.', ','] + + cnt = collections.Counter(lhs) + rhs = "..." if '...' in lhs else "" + rhs = rhs + "".join(filter(is_free, sorted(cnt.elements()))) + return rhs + + +def gen_equation_for_opteinsum(lhs, rhs): + """ + 1. gen rhs if rhs is None + 2. '...' -> 'A' + """ + + def get_used_label(counter): + used = set(counter.elements()) + for c in string.ascii_lowercase: + if c not in used: return c + raise ValueError( + "You have used all `a` - `z`, there can't find a unused for einsum optimization" + ) + + cnt = collections.Counter(lhs) + broadcast_label = get_used_label(cnt) + if rhs is None: + rhs = rhs_inference(lhs) + lhs = lhs.replace("...", broadcast_label) + rhs = rhs.replace("...", broadcast_label) + return lhs + "->" + rhs, broadcast_label + + +def einsum_v2(equation, *operands): + """ + einsum v2 implementation. + 1. Implement C++ EinsumOp. + 2. V2 create the EinsumOp to calculate, so just a little verifty work in python. + 3. V2 use opt_einsum.contract_path to optimize the multivariable einsum. + """ + n_op = len(operands) + lhs, rhs, labels = preprocess(equation, *operands) + + if n_op <= 2: + return gen_einsum_op(lhs + '->' + rhs, *operands) + + shapes = parse_fake_shape(lhs, operands, labels) + opt_equation, broadcast_label = gen_equation_for_opteinsum(lhs, rhs) + _, cons = opt_einsum.contract_path(opt_equation, *shapes, einsum_call=True) + var_list = list(operands) + for path in cons: + (a, b), _, eq, *__ = path + assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." + var_s = [var_list.pop(a), var_list.pop(b)] + eq = eq.replace(broadcast_label, "...") + var_list.append(gen_einsum_op(eq, *var_s)) + assert len( + var_list + ) == 1, "There must be one elements in list, but received %d." 
% len( + var_list) + return var_list[0] + + +def gen_einsum_op(equation, *operands): + """ + EinsumOp Python Interface: + """ + assert len(operands) <= 2, "Only support two operands in EinsumOp." + if in_dygraph_mode(): + return _C_ops.final_state_einsum(operands, equation) + + if _in_legacy_dygraph(): + # dygraph + return _C_ops.einsum(operands, 'equation', equation) + # static graph + for inp in operands: + check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') + check_type(equation, 'equation', str, 'einsum') + helper = LayerHelper('einsum', **locals()) + out = helper.create_variable_for_type_inference(dtype=operands[0].dtype) + attrs = dict() + attrs['equation'] = equation + helper.append_op( + type='einsum', + inputs={'Operands': operands}, + outputs={'Out': out}, + attrs=attrs, ) + return out + + def einsum(equation, *operands): r""" einsum(equation, *operands) @@ -817,6 +976,9 @@ def einsum(equation, *operands): # [0.50226176, 0.24512935, 0.39881429], # [0.51476848, 0.23367381, 0.39229113]]]) """ + import os + if int(os.environ.get('FLAGS_new_einsum', "0")): + return einsum_v2(equation, *operands) nop = len(operands) assert nop > 0, "At least one operand is expected." diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index b0e0082c6d9c4..127aa71137dff 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -45,9 +45,9 @@ def cast(x, dtype): equals the input dtype, but it's fine if you do so. Args: - x(Tensor): An input N-D Tensor with data type bool, float16, + x (Tensor): An input N-D Tensor with data type bool, float16, float32, float64, int32, int64, uint8. - dtype(np.dtype|str): Data type of the output: + dtype (np.dtype|str): Data type of the output: bool, float16, float32, float64, int8, int32, int64, uint8. Returns: @@ -601,8 +601,7 @@ def crop(x, shape=None, offsets=None, name=None): Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. If Variable contained, it is suitable for the case that the offsets may be changed each iteration. Default: None, the offsets are 0 at each dimension. - name(str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The cropped Tensor has same data type with `x`. @@ -742,8 +741,8 @@ def fill_(x, value): This function fill the Tensor with value inplace. Args: - x(Tensor): ``x`` is the Tensor we want to filled data inplace - value(Scale): ``value`` is the value to be filled in x + x (Tensor): ``x`` is the Tensor we want to filled data inplace + value (Scale): ``value`` is the value to be filled in x Returns: x(Tensor): Tensor x filled with value inplace @@ -776,10 +775,10 @@ def zero_(x): This function fill the Tensor with zero inplace. Args: - x(Tensor): ``x`` is the Tensor we want to filled with zero inplace + x (Tensor): ``x`` is the Tensor we want to filled with zero inplace Returns: - x(Tensor): Tensor x filled with zero inplace + x (Tensor): Tensor x filled with zero inplace Examples: .. code-block:: python @@ -798,19 +797,21 @@ def zero_(x): @dygraph_only def fill_diagonal_(x, value, offset=0, wrap=False, name=None): """ - **Notes**: - **This API is ONLY available in Dygraph mode** + Note: + This API is ONLY available in Dygraph mode. 
+ This function fill the value into the x Tensor's diagonal inplace. + Args: x(Tensor): ``x`` is the original Tensor value(Scale): ``value`` is the value to filled in x offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal). wrap(bool,optional): the diagonal 'wrapped' after N columns for tall matrices. name(str,optional): Name for the operation (optional, default is None) + Returns: Tensor: Tensor with diagonal filled with value. - Returns type: - dtype is same as x Tensor + Examples: .. code-block:: python import paddle @@ -874,25 +875,22 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): """ - **Notes**: - **This API is ONLY available in Dygraph mode** + Note: + This API is ONLY available in Dygraph mode. This function fill the source Tensor y into the x Tensor's diagonal inplace. Args: - x(Tensor): ``x`` is the original Tensor - y(Tensor): ``y`` is the Tensor to filled in x - dim1(int,optional): first dimension with respect to which to fill diagonal. Default: 0. - dim2(int,optional): second dimension with respect to which to fill diagonal. Default: 1. - offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal). - name(str,optional): Name for the operation (optional, default is None) + x (Tensor): ``x`` is the original Tensor + y (Tensor): ``y`` is the Tensor to filled in x + dim1 (int,optional): first dimension with respect to which to fill diagonal. Default: 0. + dim2 (int,optional): second dimension with respect to which to fill diagonal. Default: 1. + offset (int,optional): the offset to the main diagonal. Default: 0 (main diagonal). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Tensor with diagonal filled with y. - Returns type: - list: dtype is same as x Tensor - Examples: .. code-block:: python @@ -913,19 +911,16 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None): This function fill the source Tensor y into the x Tensor's diagonal. Args: - x(Tensor): ``x`` is the original Tensor - y(Tensor): ``y`` is the Tensor to filled in x - dim1(int,optional): first dimension with respect to which to fill diagonal. Default: 0. - dim2(int,optional): second dimension with respect to which to fill diagonal. Default: 1. - offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal). - name(str,optional): Name for the operation (optional, default is None) + x (Tensor): ``x`` is the original Tensor + y (Tensor): ``y`` is the Tensor to filled in x + dim1 (int,optional): first dimension with respect to which to fill diagonal. Default: 0. + dim2 (int,optional): second dimension with respect to which to fill diagonal. Default: 1. + offset (int,optional): the offset to the main diagonal. Default: 0 (main diagonal). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Tensor with diagonal filled with y. - Returns type: - list: dtype is same as x Tensor - Examples: .. code-block:: python @@ -944,19 +939,17 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None): @dygraph_only def tolist(x): """ - **Notes**: - **This API is ONLY available in Dygraph mode** + Note: + This API is ONLY available in Dygraph mode. This function translate the paddle.Tensor to python list. 
Args: - x(Tensor): ``x`` is the Tensor we want to translate to list + x (Tensor): ``x`` is the Tensor we want to translate to list. Returns: list: A list that contain the same value of current Tensor. - Returns type: - list: dtype is same as current Tensor Examples: .. code-block:: python @@ -980,15 +973,13 @@ def concat(x, axis=0, name=None): This OP concatenates the input along the axis. Args: - x(list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, + x (list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, float32, float64, int32, int64, uint8. All the Tensors in ``x`` must have same data type. - axis(int|Tensor, optional): Specify the axis to operate on the input Tensors. + axis (int|Tensor, optional): Specify the axis to operate on the input Tensors. It's a scalar with data type int or a Tensor with shape [1] and data type int32 or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``, it works the same way as ``axis+R``. Default is 0. - name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor with the same data type as ``x``. @@ -1097,12 +1088,10 @@ def broadcast_tensors(input, name=None): If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. Args: - input(list|tuple): ``input`` is a Tensor list or Tensor tuple which is with data type bool, + input (list|tuple): ``input`` is a Tensor list or Tensor tuple which is with data type bool, float16, float32, float64, int32, int64. All the Tensors in ``input`` must have same data type. Currently we only support tensors with rank no greater than 5. - - name (str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: list(Tensor): The list of broadcasted tensors following the same order as ``input``. @@ -1192,8 +1181,7 @@ def flip(x, axis, name=None): x (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor x should be float32, float64, int32, int64, bool. axis (list|tuple|int): The axis(axes) to flip on. Negative indices for indexing from the end are accepted. - name (str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Tensor or LoDTensor calculated by flip layer. The data type is same with input x. @@ -3143,20 +3131,19 @@ def reshape(x, shape, name=None): the corresponding dimension of x. Args: - x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool`` - shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. + x (Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool`` + shape (list|tuple|Tensor): Define the target shape. 
At most one dimension of the target shape can be -1. The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``shape`` is an Tensor, it should be an 1-D Tensor . - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A reshaped Tensor with the same data type as ``x``. Examples: .. code-block:: python + :name: code-example1 - import numpy as np import paddle x = paddle.rand([2, 4, 6], dtype="float32") @@ -3170,9 +3157,9 @@ def reshape(x, shape, name=None): print(out) # the shape of out_2 is [4, 12]. - shape_tensor = paddle.to_tensor(np.array([8, 6]).astype("int32")) + shape_tensor = paddle.to_tensor([8, 6], dtype=paddle.int32) out = paddle.reshape(x, shape=shape_tensor) - print(out) + print(out.shape) # the shape is [8, 6]. # out shares data with x in dygraph mode x[0, 0, 0] = 10. @@ -4113,14 +4100,12 @@ def take_along_axis(arr, indices, axis): Examples: .. code-block:: python + :name: code-example1 import paddle - import numpy as np - x_np = np.array([[1, 2, 3], [4, 5, 6], [7,8,9]]) - index_np = np.array([[0]]) - x = paddle.to_tensor(x_np) - index = paddle.to_tensor(index_np) + x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7,8,9]]) + index = paddle.to_tensor([[0]]) axis = 0 result = paddle.take_along_axis(x, index, axis) print(result) @@ -4180,14 +4165,12 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): Examples: .. code-block:: python + :name: code-example1 import paddle - import numpy as np - x_np = np.array([[10, 30, 20], [60, 40, 50]]) - index_np = np.array([[0]]) - x = paddle.to_tensor(x_np) - index = paddle.to_tensor(index_np) + x = paddle.to_tensor([[10, 30, 20], [60, 40, 50]]) + index = paddle.to_tensor([[0]]) value = 99 axis = 0 result = paddle.put_along_axis(x, index, value, axis) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 7e0b2e5424dad..83501b0399492 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -146,12 +146,12 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Out=scale*(X+bias) Args: - x(Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. - scale(float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32. - bias(float): The bias to be put on the input. - bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances. - act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + x (Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8. + scale (float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32. + bias (float): The bias to be put on the input. + bias_after_scale (bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances. 
+ act (str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Output tensor of scale operator, with shape and data type same as input. @@ -281,24 +281,23 @@ def multiplex(inputs, index, name=None): Args: inputs (list): The input Tensor list. The list elements are N-D Tensors of data types float32, float64, int32, int64. All input Tensor shapes should be the same and rank must be at least 2. index (Tensor): Used to select some rows in the input Tensor to construct an index of the output Tensor. It is a 2-D Tensor with data type int32 or int64 and shape [M, 1], where M is the number of input Tensors. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Returns: Tensor: Output of multiplex OP, with data type being float32, float64, int32, int64. Examples: .. code-block:: python + :name: code-example1 import paddle - import numpy as np - img1 = np.array([[1, 2], [3, 4]]).astype(np.float32) - img2 = np.array([[5, 6], [7, 8]]).astype(np.float32) - inputs = [paddle.to_tensor(img1), paddle.to_tensor(img2)] - index = paddle.to_tensor(np.array([[1], [0]]).astype(np.int32)) + img1 = paddle.to_tensor([[1, 2], [3, 4]], dtype=paddle.float32) + img2 = paddle.to_tensor([[5, 6], [7, 8]], dtype=paddle.float32) + inputs = [img1, img2] + index = paddle.to_tensor([[1], [0]], dtype=paddle.int32) res = paddle.multiplex(inputs, index) - print(res) # [array([[5., 6.], [3., 4.]], dtype=float32)] + print(res) # Tensor([[5., 6.], [3., 4.]], dtype=float32) """ if _non_static_mode(): @@ -1077,8 +1076,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): output Tensor. The result Tensor will have one fewer dimension than the :attr:`x` unless :attr:`keepdim` is true, default value is False. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Returns: Tensor: Results of summation operation on the specified axis of input Tensor `x`, @@ -1134,15 +1132,10 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): else: reduce_all_flag = False - def get_dtype(x, dtype): - if dtype is not None: - return (True, dtype) - src_type = convert_dtype(x.dtype) - if src_type in ['bool','int32', 'int64']: - return (True, 'int64') - return (False, src_type) - - dtype_flag, dtype = get_dtype(x, dtype) + dtype_flag = False + if dtype is not None: + dtype_flag = True + dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): if reduce_all_flag: @@ -1150,17 +1143,14 @@ def get_dtype(x, dtype): else: axis = axis if axis != None and axis != [] else [0] - out_dtype = convert_np_dtype_to_dtype_(dtype) - out = _C_ops.final_state_sum(x, axis, out_dtype, keepdim) - return out + return _C_ops.final_state_sum(x, axis, dtype, keepdim) if _in_legacy_dygraph(): axis = axis if axis != None and axis != [] else [0] if dtype_flag: return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag, 'in_dtype', - x.dtype, 'out_dtype', - convert_np_dtype_to_dtype_(dtype)) + x.dtype, 'out_dtype', dtype) else: return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag) @@ -1174,7 +1164,7 @@ def get_dtype(x, dtype): if dtype_flag: attrs.update({ 'in_dtype': x.dtype, - 'out_dtype': convert_np_dtype_to_dtype_(dtype) + 'out_dtype': dtype }) check_variable_and_dtype( @@ -1188,7 +1178,7 @@ def get_dtype(x, dtype): helper = LayerHelper('sum', **locals()) if dtype_flag: out = helper.create_variable_for_type_inference( - dtype=convert_np_dtype_to_dtype_(dtype)) + dtype=dtype) else: out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( @@ -1216,8 +1206,7 @@ def nansum(x, axis=None, dtype=None, keepdim=False, name=None): output Tensor. The result Tensor will have one fewer dimension than the :attr:`x` unless :attr:`keepdim` is true, default value is False. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Results of summation operation on the specified axis of input Tensor `x`, @@ -1368,8 +1357,7 @@ def add_n(inputs, name=None): Args: inputs (Tensor|list[Tensor]|tuple[Tensor]): A Tensor or a list/tuple of Tensors. The shape and data type of the list/tuple elements should be consistent. Input can be multi-dimensional Tensor, and data types can be: float32, float64, int32, int64. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, the sum of input :math:`inputs` , its shape and data types are consistent with :math:`inputs`. @@ -1480,8 +1468,7 @@ def mm(input, mat2, name=None): Args: input (Tensor): The input tensor which is a Tensor. mat2 (Tensor): The input tensor which is a Tensor. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). 
For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The product Tensor. @@ -1599,7 +1586,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): y (Tensor): The second input Tensor for matrix multiplication. beta (float): Coefficient of $input$. alpha (float): Coefficient of $x*y$. - name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The output Tensor of addmm op. @@ -1727,8 +1714,7 @@ def inner(x, y, name=None): Args: x (Tensor): An N-D Tensor or a Scalar Tensor. If its not a scalar Tensor, its last dimensions must match y's. y (Tensor): An N-D Tensor or a Scalar Tensor. If its not a scalar Tensor, its last dimensions must match x's. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The inner-product Tensor, the output shape is x.shape[:-1] + y.shape[:-1]. @@ -1799,8 +1785,7 @@ def outer(x, y, name=None): Args: x (Tensor): An N-D Tensor or a Scalar Tensor. y (Tensor): An N-D Tensor or a Scalar Tensor. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The outer-product Tensor. @@ -1923,9 +1908,7 @@ def inverse(x, name=None): dimensions should be equal. When the number of dimensions is greater than 2, it is treated as batches of square matrix. The data type can be float32 and float64. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, - please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor holds the inverse of x. The shape and data type @@ -1989,18 +1972,17 @@ def max(x, axis=None, keepdim=False, name=None): Args: - x(Tensor): A tensor, the data type is float32, float64, int32, int64. - axis(int|list|tuple, optional): The axis along which the maximum is computed. + x (Tensor): A tensor, the data type is float32, float64, int32, int64. + axis (int|list|tuple, optional): The axis along which the maximum is computed. If :attr:`None`, compute the maximum over all elements of `x` and return a Tensor with a single element, otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`. If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. - keepdim(bool, optional): Whether to reserve the reduced dimension in the + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the `x` unless :attr:`keepdim` is true, default value is False. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. 
For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, results of maximum on the specified axis of input tensor, @@ -2093,18 +2075,17 @@ def min(x, axis=None, keepdim=False, name=None): while min propagates gradient to all of them. Args: - x(Tensor): A tensor, the data type is float32, float64, int32, int64. - axis(int|list|tuple, optional): The axis along which the minimum is computed. + x (Tensor): A tensor, the data type is float32, float64, int32, int64. + axis (int|list|tuple, optional): The axis along which the minimum is computed. If :attr:`None`, compute the minimum over all elements of `x` and return a Tensor with a single element, otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. - keepdim(bool, optional): Whether to reserve the reduced dimension in the + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the `x` unless :attr:`keepdim` is true, default value is False. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, results of minimum on the specified axis of input tensor, @@ -2197,19 +2178,18 @@ def amax(x, axis=None, keepdim=False, name=None): while max propagates gradient to all of them. Args: - x(Tensor): A tensor, the data type is float32, float64, int32, int64, + x (Tensor): A tensor, the data type is float32, float64, int32, int64, the dimension is no more than 4. - axis(int|list|tuple, optional): The axis along which the maximum is computed. + axis (int|list|tuple, optional): The axis along which the maximum is computed. If :attr:`None`, compute the maximum over all elements of `x` and return a Tensor with a single element, otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`. If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. - keepdim(bool, optional): Whether to reserve the reduced dimension in the + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the `x` unless :attr:`keepdim` is true, default value is False. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, results of maximum on the specified axis of input tensor, @@ -2310,19 +2290,18 @@ def amin(x, axis=None, keepdim=False, name=None): while min propagates gradient to all of them. Args: - x(Tensor): A tensor, the data type is float32, float64, int32, int64, + x (Tensor): A tensor, the data type is float32, float64, int32, int64, the dimension is no more than 4. - axis(int|list|tuple, optional): The axis along which the minimum is computed. + axis (int|list|tuple, optional): The axis along which the minimum is computed. 
If :attr:`None`, compute the minimum over all elements of `x` and return a Tensor with a single element, otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. - keepdim(bool, optional): Whether to reserve the reduced dimension in the + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the `x` unless :attr:`keepdim` is true, default value is False. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, results of minimum on the specified axis of input tensor, @@ -2421,8 +2400,8 @@ def log1p(x, name=None): Args: x (Tensor): Input Tensor. Must be one of the following types: float32, float64. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Returns: Tensor, the natural log of the input Tensor computed element-wise. @@ -2459,7 +2438,7 @@ def log2(x, name=None): Args: x (Tensor): Input tensor must be one of the following types: float32, float64. - name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -2511,7 +2490,7 @@ def log10(x, name=None): Args: x (Tensor): Input tensor must be one of the following types: float32, float64. - name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -2568,9 +2547,7 @@ def clip(x, min=None, max=None, name=None): with shape [1] and type ``int32``, ``float32``, ``float64``. max (float|int|Tensor): The upper bound with type ``float``, ``int`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. - name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor with the same data type and data shape as input. @@ -2700,11 +2677,11 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): - Note that if offset is out of input's shape indicated by axis1 and axis2, 0 will be returned. Args: - x(Tensor): The input tensor x. Must be at least 2-dimensional. The input data type should be float32, float64, int32, int64. - offset(int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals). - axis1(int, optional): The first axis with respect to take diagonal. Default: 0. - axis2(int, optional): The second axis with respect to take diagonal. Default: 1. 
- name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. + x (Tensor): The input tensor x. Must be at least 2-dimensional. The input data type should be float32, float64, int32, int64. + offset (int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals). + axis1 (int, optional): The first axis with respect to take diagonal. Default: 0. + axis2 (int, optional): The second axis with respect to take diagonal. Default: 1. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: the output data type is the same as input data type. @@ -2785,11 +2762,11 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): - If offset < 0, it is below the main diagonal. Args: - x(Tensor): The input tensor x. Must be at least 2-dimensional. The input data type should be bool, int32, int64, float16, float32, float64. - offset(int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals). - axis1(int, optional): The first axis with respect to take diagonal. Default: 0. - axis2(int, optional): The second axis with respect to take diagonal. Default: 1. - name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. + x (Tensor): The input tensor x. Must be at least 2-dimensional. The input data type should be bool, int32, int64, float16, float32, float64. + offset (int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals). + axis1 (int, optional): The first axis with respect to take diagonal. Default: 0. + axis2 (int, optional): The second axis with respect to take diagonal. Default: 1. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: a partial view of input tensor in specify two dimensions, the output data type is the same as input data type. @@ -2893,9 +2870,7 @@ def kron(x, y, name=None): y (Tensor): the second operand of kron op, data type: float16, float32, float64, int32 or int64. Its data type should be the same with x. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The output of kron op, data type: float16, float32, float64, int32 or int64. Its data is the same with x. @@ -3155,19 +3130,18 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): Compute the product of tensor elements over the given axis. Args: - x(Tensor): The input tensor, its data type should be float32, float64, int32, int64. - axis(int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, + x (Tensor): The input tensor, its data type should be float32, float64, int32, int64. + axis (int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, multiply all elements of `x` and return a Tensor with a single element, otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, the axis to reduce is :math:`x.ndim + axis[i]`. Default is None. 
- dtype(str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, + dtype (str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before operator performed. This is very useful for avoiding data type overflows. The default value is None, the dtype of output is the same as input Tensor `x`. - keepdim(bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False. - name(string, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, result of product on the specified dim of input tensor. @@ -3253,9 +3227,8 @@ def sign(x, name=None): This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. Args: - x(Tensor): The input tensor. The data type can be float16, float32 or float64. - name (str, optional): The default value is None. Normally there is no need for user to - set this property. For more information, please refer to :ref:`api_guide_Name` + x (Tensor): The input tensor. The data type can be float16, float32 or float64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The output sign tensor with identical shape and data type to the input :attr:`x`. @@ -3338,7 +3311,7 @@ def increment(x, value=1.0, name=None): Args: x (Tensor): A tensor that must always contain only one element, its data type supports float32, float64, int32 and int64. - value(float, optional): The amount to increment the data of :attr:`x`. Default: 1.0. + value (float, optional): The amount to increment the data of :attr:`x`. Default: 1.0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -3386,8 +3359,7 @@ def all(x, axis=None, keepdim=False, name=None): output Tensor. The result Tensor will have one fewer dimension than the :attr:`x` unless :attr:`keepdim` is true, default value is False. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Results the ``logical and`` on the specified axis of input Tensor `x`, it's data type is bool. @@ -3483,8 +3455,7 @@ def any(x, axis=None, keepdim=False, name=None): output Tensor. The result Tensor will have one fewer dimension than the :attr:`x` unless :attr:`keepdim` is true, default value is False. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Returns: Tensor: Results the ``logical or`` on the specified axis of input Tensor `x`, it's data type is bool. @@ -3599,8 +3570,7 @@ def conj(x, name=None): Args: x (Tensor): The input tensor which hold the complex numbers. Optional data types are: complex64, complex128, float32, float64, int32 or int64. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: out (Tensor): The conjugate of input. The shape and data type is the same with input. @@ -3645,8 +3615,7 @@ def digamma(x, name=None): Args: x (Tensor): Input Tensor. Must be one of the following types: float32, float64. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, the digamma of the input Tensor, the shape and data type is the same with input. @@ -4201,18 +4170,17 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): Only n=1 is currently supported. Args: - x(Tensor): The input tensor to compute the forward difference on - n(int, optional): The number of times to recursively compute the difference. + x (Tensor): The input tensor to compute the forward difference on + n (int, optional): The number of times to recursively compute the difference. Only support n=1. Default:1 - axis(int, optional): The axis to compute the difference along. Default:-1 - prepend(Tensor, optional): The tensor to prepend to input along axis before computing the difference. + axis (int, optional): The axis to compute the difference along. Default:-1 + prepend (Tensor, optional): The tensor to prepend to input along axis before computing the difference. It's dimensions must be equivalent to that of x, and its shapes must match x's shape except on axis. - append(Tensor, optional): The tensor to append to input along axis before computing the difference, + append (Tensor, optional): The tensor to append to input along axis before computing the difference, It's dimensions must be equivalent to that of x, and its shapes must match x's shape except on axis. - name(str|None): A name for this layer(optional). If set None, - the layer will be named automatically. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The output tensor with same dtype with x. 
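The hunk below routes the bool branch of `paddle.diff` through the new `final_state_logical_xor` op in eager mode. As a quick orientation (an editor's sketch of the expected behaviour, not part of the patch): for boolean inputs a first-order difference is an element-wise XOR of neighbouring elements rather than a subtraction.

    import paddle

    x = paddle.to_tensor([True, False, False, True])
    # Adjacent elements are combined with logical XOR for bool inputs,
    # so the expected result is [True, False, True].
    print(paddle.diff(x))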
@@ -4292,18 +4260,19 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): ends_2 = [dim_len] attrs_2 += ('ends', ends_2) if in_dygraph_mode(): - input_back = input_front = _C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags, + input_back = _C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags, []) else: input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ 'infer_flags', infer_flags, *attrs_2) if x.dtype == paddle.bool: - op = getattr(_C_ops, "logical_xor") - out = op(input_back, input_front) + if in_dygraph_mode(): + return _C_ops.final_state_logical_xor(input_back, input_front) + else: + return _C_ops.logical_xor(input_back, input_front) else: - out = elementwise_sub(input_back, input_front, axis=axis) - return out + return elementwise_sub(input_back, input_front, axis=axis) else: check_variable_and_dtype(x, 'x', ['float32', 'float64', 'bool', 'int32', 'int64'], 'diff') @@ -4418,7 +4387,7 @@ def frac(x, name=None): Args: x (Tensor): The input tensor, which data type should be int32, int64, float32, float64. - name: (str, optional): Name for operation (optional, default is None). For more + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The output Tensor of frac. diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 974943a99d8b4..242680bc7c738 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -175,6 +175,12 @@ def test_random_crop(self): trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) img = trans_random_crop_pad(img) + def test_erase(self): + trans = transforms.Compose([ + transforms.RandomErasing(), transforms.RandomErasing(value="random") + ]) + self.do_transform(trans) + def test_grayscale(self): trans = transforms.Compose([transforms.Grayscale()]) self.do_transform(trans) @@ -299,6 +305,24 @@ def test_exception(self): with self.assertRaises(NotImplementedError): transform = transforms.BrightnessTransform('0.1', keys='a') + with self.assertRaises(Exception): + transform = transforms.RandomErasing(scale=0.5) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(ratio=0.8) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(scale=(10, 0.4)) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(ratio=(3.3, 0.3)) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(prob=1.5) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(value="0") + def test_info(self): str(transforms.Compose([transforms.Resize((224, 224))])) str(transforms.Compose([transforms.Resize((224, 224))])) @@ -355,6 +379,10 @@ def test_normalize(self): trans = transforms.Compose([normalize]) self.do_transform(trans) + def test_color_jitter(self): + trans = transforms.Compose([transforms.ColorJitter(1.1, 2.2, 0.8, 0.1)]) + self.do_transform(trans) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -398,6 +426,13 @@ def test_random_crop(self): trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) img = trans_random_crop_pad(img) + def test_erase(self): + trans = transforms.Compose([ + transforms.RandomErasing(value=(0.5, )), + transforms.RandomErasing(value="random") + ]) + self.do_transform(trans) + def test_exception(self): trans = 
transforms.Compose([transforms.Resize(-1)]) @@ -562,6 +597,59 @@ def test_center_crop(self): tensor_cropped_img.numpy().transpose((1, 2, 0)), decimal=4) + def test_color_jitter_sub_function(self): + np.random.seed(555) + np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img) + tensor_img = F.to_tensor(np_img) + np_img = pil_img + + np_img_gray = (np.random.rand(28, 28, 1) * 255).astype('uint8') + tensor_img_gray = F.to_tensor(np_img_gray) + + places = ['cpu'] + if paddle.device.is_compiled_with_cuda(): + places.append('gpu') + + def test_adjust_brightness(np_img, tensor_img): + result_cv2 = np.array(F.adjust_brightness(np_img, 1.2)) + result_tensor = F.adjust_brightness(tensor_img, 1.2).numpy() + result_tensor = np.transpose(result_tensor * 255, + (1, 2, 0)).astype('uint8') + np.testing.assert_equal(result_cv2, result_tensor) + + # For adjust_contrast / adjust_saturation / adjust_hue the implement is kind + # of different between PIL and Tensor. So the results can not equal exactly. + + def test_adjust_contrast(np_img, tensor_img): + result_pil = np.array(F.adjust_contrast(np_img, 0.36)) + result_tensor = F.adjust_contrast(tensor_img, 0.36).numpy() + result_tensor = np.transpose(result_tensor * 255, (1, 2, 0)) + diff = np.max(np.abs(result_tensor - result_pil)) + self.assertTrue(diff < 1.1) + + def test_adjust_saturation(np_img, tensor_img): + result_pil = np.array(F.adjust_saturation(np_img, 1.0)) + result_tensor = F.adjust_saturation(tensor_img, 1.0).numpy() + result_tensor = np.transpose(result_tensor * 255., (1, 2, 0)) + diff = np.max(np.abs(result_tensor - result_pil)) + self.assertTrue(diff < 1.1) + + def test_adjust_hue(np_img, tensor_img): + result_pil = np.array(F.adjust_hue(np_img, 0.45)) + result_tensor = F.adjust_hue(tensor_img, 0.45).numpy() + result_tensor = np.transpose(result_tensor * 255, (1, 2, 0)) + diff = np.max(np.abs(result_tensor - result_pil)) + self.assertTrue(diff <= 16.0) + + for place in places: + paddle.set_device(place) + + test_adjust_brightness(np_img, tensor_img) + test_adjust_contrast(np_img, tensor_img) + test_adjust_saturation(np_img, tensor_img) + test_adjust_hue(np_img, tensor_img) + def test_pad(self): np_img = (np.random.rand(28, 24, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img) @@ -637,6 +725,47 @@ def test_to_tensor(self): pil_img = Image.fromarray(np_img).convert('YCbCr') pil_tensor = F.to_tensor(pil_img) + def test_erase(self): + np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + + expected = np_img.copy() + expected[10:15, 10:15, :] = 0 + + F.erase(np_img, 10, 10, 5, 5, 0, inplace=True) + np.testing.assert_equal(np_img, expected) + + pil_result = F.erase(pil_img, 10, 10, 5, 5, 0) + np.testing.assert_equal(np.array(pil_result), expected) + + np_data = np.random.rand(3, 28, 28).astype('float32') + places = ['cpu'] + if paddle.device.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + tensor_img = paddle.to_tensor(np_data) + expected_tensor = tensor_img.clone() + expected_tensor[:, 10:15, 10:15] = paddle.to_tensor([0.88]) + + tensor_result = F.erase(tensor_img, 10, 10, 5, 5, + paddle.to_tensor([0.88])) + np.testing.assert_equal(tensor_result.numpy(), + expected_tensor.numpy()) + + def test_erase_backward(self): + img = paddle.randn((3, 14, 14), dtype=np.float32) + img.stop_gradient = False + erased = F.erase( + img, 3, 3, 5, 5, paddle.ones( + (1, 1, 1), dtype='float32')) + loss = 
erased.sum() + loss.backward() + + expected_grad = np.ones((3, 14, 14), dtype=np.float32) + expected_grad[:, 3:8, 3:8] = 0. + np.testing.assert_equal(img.grad.numpy(), expected_grad) + def test_image_load(self): fake_img = Image.fromarray((np.random.random((32, 32, 3)) * 255).astype( 'uint8')) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index d401e7c5190fe..f078aae9bb6b1 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -466,6 +466,7 @@ func : DeformableConvInferMeta kernel : func : deformable_conv + data_type : x optional : mask backward : deformable_conv_grad @@ -546,6 +547,7 @@ func : DropoutInferMeta kernel : func : dropout + data_type : x optional : seed_tensor backward : dropout_grad @@ -559,6 +561,16 @@ func : eigh backward : eigh_grad +- api : einsum + args : (Tensor[] x, str equation) + output : Tensor + infer_meta : + func : EinsumInferMeta + param : [x, equation] + kernel : + func : einsum + backward : einsum_grad + - api : elementwise_pow args : (Tensor x, Tensor y) output : Tensor(out) @@ -1065,6 +1077,7 @@ func : LayerNormInferMeta kernel : func : layer_norm + data_type : x backward : layer_norm_grad optional : scale, bias @@ -1608,6 +1621,7 @@ func : PsroiPoolInferMeta kernel : func : psroi_pool + data_type : x optional : boxes_num backward : psroi_pool_grad @@ -1713,6 +1727,7 @@ func : RoiAlignInferMeta kernel : func : roi_align + data_type : x optional : boxes_num backward : roi_align_grad @@ -1723,6 +1738,7 @@ func : RoiPoolInferMeta kernel : func : roi_pool + data_type : x optional : boxes_num intermediate : arg_max backward : roi_pool_grad diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3b47470139b90..e044447f87c22 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -152,6 +152,18 @@ kernel : func : atanh_grad +- backward_api : batch_norm_double_grad + forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) + args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, x] + kernel : + func : batch_norm_grad_grad + data_type : x + optional : out_mean, out_variance + - backward_api : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool 
is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) @@ -163,6 +175,7 @@ func : batch_norm_grad data_type : out_grad optional : mean_out, variance_out, reserve_space + backward : batch_norm_double_grad - backward_api : bce_loss_grad forward : bce_loss (Tensor input, Tensor label) -> Tensor(out) @@ -362,6 +375,7 @@ func : DeformableConvGradInferMeta kernel : func : deformable_conv_grad + data_type : x optional : mask - backward_api : depthwise_conv2d_transpose_grad @@ -414,6 +428,18 @@ kernel : func : dist_grad +- backward_api : divide_double_grad + forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [y, grad_x, grad_x] + kernel : + func : divide_double_grad + data_type : out + optional : grad_x_grad, grad_y_grad + - backward_api : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, int axis = -1) @@ -423,6 +449,7 @@ param : [x, y] kernel : func : divide_grad + backward : divide_double_grad - backward_api : dropout_grad forward : dropout (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) @@ -444,6 +471,19 @@ param : [out_v] kernel : func : eigh_grad + data_type : out_v + data_transform: + skip_transform : out_w, out_w_grad + +- backward_api : einsum_grad + forward : einsum (Tensor[] x, str equation) -> Tensor(out) + args : (Tensor[] x, Tensor out_grad, str equation) + output : Tensor[](x_grad){x.size()} + infer_meta : + func : UnchangedMultiInferMeta + param : [x] + kernel : + func : einsum_grad - backward_api : elementwise_pow_grad forward : elementwise_pow(Tensor x, Tensor y) -> Tensor(out) @@ -455,6 +495,16 @@ kernel : func : elementwise_pow_grad +- backward_api : elu_double_grad + forward : elu_grad (Tensor x, Tensor out, Tensor grad_out, float alpha)-> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : elu_double_grad + - backward_api : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float alpha) @@ -464,6 +514,7 @@ param : [x] kernel : func : elu_grad + backward : elu_double_grad - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) @@ -633,6 +684,7 @@ param : [x] kernel : func : graph_send_recv_grad + data_type : out_grad optional: out, dst_count - backward_api : gumbel_softmax_grad @@ -1287,6 +1339,7 @@ param : [x] kernel : func : psroi_pool_grad + data_type : x optional : boxes_num # output is optional @@ -1381,6 +1434,7 @@ param : [x] kernel : func : roi_align_grad + data_type : boxes optional : boxes_num - backward_api : roi_pool_grad @@ -1392,6 +1446,7 @@ param : [x] kernel : func : roi_pool_grad + data_type : x optional : boxes_num - backward_api : roll_grad @@ -1498,7 +1553,7 @@ func : UnchangedInferMeta param : [x] kernel : - func : sigmoid_cross_entropy_with_logits_grad + func : sigmoid_cross_entropy_with_logits_grad - backward_api : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) @@ -1654,6 +1709,18 @@ func : strided_slice_grad 
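The double-grad entries being added to backward.yaml here (divide_double_grad, elu_double_grad, subtract_double_grad and the tanh double/triple variants) are what the imperative engine dispatches to when a first-order gradient is itself differentiated. A minimal dynamic-graph sketch of how they get exercised, assuming a Paddle build with these kernels compiled in:

.. code-block:: python

    # Hedged sketch: a second-order gradient of tanh, which routes through tanh_double_grad.
    import paddle

    x = paddle.to_tensor([0.3, 0.7], stop_gradient=False)
    y = paddle.tanh(x)

    # Keep the first-order gradient on the graph so it can be differentiated again.
    (dy_dx,) = paddle.grad(y.sum(), x, create_graph=True)
    (d2y_dx2,) = paddle.grad(dy_dx.sum(), x)

    print(d2y_dx2.numpy())  # elementwise -2 * tanh(x) * (1 - tanh(x)^2)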
no_need_buffer : x +- backward_api : subtract_double_grad + forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_out] + kernel : + func : subtract_double_grad + optional : grad_x_grad, grad_y_grad + no_need_buffer : y, grad_out + - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -1664,6 +1731,7 @@ kernel : func : subtract_grad no_need_buffer : x, y + backward : subtract_double_grad - backward_api : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) @@ -1720,6 +1788,17 @@ kernel : func : tan_grad +- backward_api : tanh_double_grad + forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : tanh_double_grad + backward : tanh_triple_grad + - backward_api : tanh_grad forward : tanh (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1729,6 +1808,7 @@ param : [out] kernel : func : tanh_grad + backward : tanh_double_grad - backward_api : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) @@ -1740,6 +1820,16 @@ kernel : func : tanh_shrink_grad +- backward_api : tanh_triple_grad + forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) + args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad) + output : Tensor(out_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [out, out, grad_x_grad_forward] + kernel : + func : tanh_triple_grad + - backward_api : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 100d7ad78319b..ca4330f2af362 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -65,3 +65,12 @@ args : (Tensor x) output : Tensor(out@SparseCsrTensor) invoke : to_sparse_csr_impl(x) + +- api: maxpool + args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) + output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + kernel : + func : sparse_maxpool + layout : x + intermediate : rulebook + backward : sparse_maxpool_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index e3946cbf72bc2..74299ed3e39a0 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -32,6 +32,13 @@ output : Tensor(x_grad@DenseTensor) invoke : to_dense_impl(out_grad) +- backward_api : sparse_maxpool_grad + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor rulebook, Tensor out, Tensor 
out_grad, int[] kernel_sizes) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_maxpool_grad + - backward_api : sparse_relu_grad forward : sparse_relu(Tensor x) -> Tensor(out@SparseCooTensor) args : (Tensor x, Tensor out_grad) diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index 413f09f78699e..b255e663e6876 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -31,6 +31,7 @@ from .transforms import RandomRotation # noqa: F401 from .transforms import Grayscale # noqa: F401 from .transforms import ToTensor # noqa: F401 +from .transforms import RandomErasing # noqa: F401 from .functional import to_tensor # noqa: F401 from .functional import hflip # noqa: F401 from .functional import vflip # noqa: F401 @@ -44,6 +45,7 @@ from .functional import adjust_contrast # noqa: F401 from .functional import adjust_hue # noqa: F401 from .functional import normalize # noqa: F401 +from .functional import erase # noqa: F401 __all__ = [ #noqa 'BaseTransform', @@ -65,6 +67,7 @@ 'RandomRotation', 'Grayscale', 'ToTensor', + 'RandomErasing', 'to_tensor', 'hflip', 'vflip', @@ -77,5 +80,6 @@ 'adjust_brightness', 'adjust_contrast', 'adjust_hue', - 'normalize' + 'normalize', + 'erase', ] diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 8caab964bf87b..5a8c2cc09f884 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -370,13 +370,13 @@ def adjust_brightness(img, brightness_factor): """Adjusts brightness of an Image. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. brightness_factor (float): How much to adjust the brightness. Can be any non negative number. 0 gives a black image, 1 gives the original image while 2 increases the brightness by a factor of 2. Returns: - PIL.Image or np.array: Brightness adjusted image. + PIL.Image|np.array|paddle.Tensor: Brightness adjusted image. Examples: .. code-block:: python @@ -392,28 +392,31 @@ def adjust_brightness(img, brightness_factor): converted_img = F.adjust_brightness(fake_img, 0.4) print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_brightness(img, brightness_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_brightness(img, brightness_factor) + else: + return F_t.adjust_brightness(img, brightness_factor) def adjust_contrast(img, contrast_factor): """Adjusts contrast of an Image. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. contrast_factor (float): How much to adjust the contrast. Can be any non negative number. 0 gives a solid gray image, 1 gives the original image while 2 increases the contrast by a factor of 2. Returns: - PIL.Image or np.array: Contrast adjusted image. + PIL.Image|np.array|paddle.Tensor: Contrast adjusted image. Examples: .. 
code-block:: python @@ -429,28 +432,31 @@ def adjust_contrast(img, contrast_factor): converted_img = F.adjust_contrast(fake_img, 0.4) print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_contrast(img, contrast_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_contrast(img, contrast_factor) + else: + return F_t.adjust_contrast(img, contrast_factor) def adjust_saturation(img, saturation_factor): """Adjusts color saturation of an image. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. saturation_factor (float): How much to adjust the saturation. 0 will give a black and white image, 1 will give the original image while 2 will enhance the saturation by a factor of 2. Returns: - PIL.Image or np.array: Saturation adjusted image. + PIL.Image|np.array|paddle.Tensor: Saturation adjusted image. Examples: .. code-block:: python @@ -467,15 +473,18 @@ def adjust_saturation(img, saturation_factor): print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_saturation(img, saturation_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_saturation(img, saturation_factor) + else: + return F_t.adjust_saturation(img, saturation_factor) def adjust_hue(img, hue_factor): @@ -489,7 +498,7 @@ def adjust_hue(img, hue_factor): interval `[-0.5, 0.5]`. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. hue_factor (float): How much to shift the hue channel. Should be in [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in HSV space in positive and negative direction respectively. @@ -497,7 +506,7 @@ def adjust_hue(img, hue_factor): with complementary colors while 0 gives the original image. Returns: - PIL.Image or np.array: Hue adjusted image. + PIL.Image|np.array|paddle.Tensor: Hue adjusted image. Examples: .. code-block:: python @@ -514,15 +523,18 @@ def adjust_hue(img, hue_factor): print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_hue(img, hue_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_hue(img, hue_factor) + else: + return F_t.adjust_hue(img, hue_factor) def rotate(img, @@ -677,3 +689,39 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): img = np.array(img).astype(np.float32) return F_cv2.normalize(img, mean, std, data_format, to_rgb) + + +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of selected area in input image with given value. 
+ + Args: + img (paddle.Tensor | np.array | PIL.Image): input Tensor image. + For Tensor input, the shape should be (C, H, W). For np.array input, + the shape should be (H, W, C). + i (int): y coordinate of the top-left point of erased region. + j (int): x coordinate of the top-left point of erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (paddle.Tensor | np.array): value used to replace the pixels in erased region. It + should be np.array when img is np.array or PIL.Image. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + paddle.Tensor | np.array | PIL.Image: Erased image. The type is same with input image. + + Examples: + .. code-block:: python + + import paddle + + fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + values = paddle.zeros((1,1,1), dtype=paddle.float32) + result = paddle.vision.transforms.erase(fake_img, 4, 4, 3, 3, values) + + """ + if _is_tensor_image(img): + return F_t.erase(img, i, j, h, w, v, inplace=inplace) + elif _is_pil_image(img): + return F_pil.erase(img, i, j, h, w, v, inplace=inplace) + else: + return F_cv2.erase(img, i, j, h, w, v, inplace=inplace) diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 38b50898be606..8343a8c340ffb 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -564,3 +564,26 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): img = (img - mean) / std return img + + +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of selected area in input image array with given value. + + Args: + img (np.array): input image array, which shape is (H, W, C). + i (int): y coordinate of the top-left point of erased region. + j (int): x coordinate of the top-left point of erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (np.array): value used to replace the pixels in erased region. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + np.array: Erased image. + + """ + if not inplace: + img = img.copy() + + img[i:i + h, j:j + w, ...] 
= v + return img diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index b3ff37d7ea3bb..71f7759f11b66 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -32,14 +32,25 @@ Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable -_pil_interp_from_str = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING -} +try: + # PIL version >= "9.1.0" + _pil_interp_from_str = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } +except: + _pil_interp_from_str = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } __all__ = [] @@ -469,3 +480,26 @@ def to_grayscale(img, num_output_channels=1): raise ValueError('num_output_channels should be either 1 or 3') return img + + +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of selected area in input image with given value. PIL format is + not support inplace. + + Args: + img (PIL.Image): input image, which shape is (C, H, W). + i (int): y coordinate of the top-left point of erased region. + j (int): x coordinate of the top-left point of erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (np.array): value used to replace the pixels in erased region. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + PIL.Image: Erased image. + + """ + np_img = np.array(img, dtype=np.uint8) + np_img[i:i + h, j:j + w, ...] = v + img = Image.fromarray(np_img, 'RGB') + return img diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 5e5cf465425ed..2e276883cd376 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -86,6 +86,68 @@ def _get_image_size(img, data_format): _get_image_h_axis(data_format)] +def _rgb_to_hsv(img): + """Convert a image Tensor from RGB to HSV. This implementation is based on Pillow ( + https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Convert.c) + """ + maxc = img.max(axis=-3) + minc = img.min(axis=-3) + + is_equal = paddle.equal(maxc, minc) + one_divisor = paddle.ones_like(maxc) + c_delta = maxc - minc + # s is 0 when maxc == minc, set the divisor to 1 to avoid zero divide. + s = c_delta / paddle.where(is_equal, one_divisor, maxc) + + r, g, b = img.unbind(axis=-3) + c_delta_divisor = paddle.where(is_equal, one_divisor, c_delta) + # when maxc == minc, there is r == g == b, set the divisor to 1 to avoid zero divide. + rc = (maxc - r) / c_delta_divisor + gc = (maxc - g) / c_delta_divisor + bc = (maxc - b) / c_delta_divisor + + hr = (maxc == r).astype(maxc.dtype) * (bc - gc) + hg = ((maxc == g) & (maxc != r)).astype(maxc.dtype) * (rc - bc + 2.0) + hb = ((maxc != r) & (maxc != g)).astype(maxc.dtype) * (gc - rc + 4.0) + h = (hr + hg + hb) / 6.0 + 1.0 + h = h - h.trunc() + return paddle.stack([h, s, maxc], axis=-3) + + +def _hsv_to_rgb(img): + """Convert a image Tensor from HSV to RGB. 
+ """ + h, s, v = img.unbind(axis=-3) + f = h * 6.0 + i = paddle.floor(f) + f = f - i + i = i.astype(paddle.int32) % 6 + + p = paddle.clip(v * (1.0 - s), 0.0, 1.0) + q = paddle.clip(v * (1.0 - s * f), 0.0, 1.0) + t = paddle.clip(v * (1.0 - s * (1.0 - f)), 0.0, 1.0) + + mask = paddle.equal( + i.unsqueeze(axis=-3), + paddle.arange( + 6, dtype=i.dtype).reshape((-1, 1, 1))).astype(img.dtype) + matrix = paddle.stack( + [ + paddle.stack( + [v, q, p, p, t, v], axis=-3), paddle.stack( + [t, v, v, q, p, p], axis=-3), paddle.stack( + [p, p, t, v, v, q], axis=-3) + ], + axis=-4) + return paddle.einsum("...ijk, ...xijk -> ...xjk", mask, matrix) + + +def _blend_images(img1, img2, ratio): + max_value = 1.0 if paddle.is_floating_point(img1) else 255.0 + return paddle.lerp(img2, img1, float(ratio)).clip( + 0, max_value).astype(img1.dtype) + + def normalize(img, mean, std, data_format='CHW'): """Normalizes a tensor image given mean and standard deviation. @@ -354,6 +416,30 @@ def crop(img, top, left, height, width, data_format='CHW'): return img[top:top + height, left:left + width, :] +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of selected area in input Tensor image with given value. + + Args: + img (paddle.Tensor): input Tensor image. + i (int): y coordinate of the top-left point of erased region. + j (int): x coordinate of the top-left point of erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (paddle.Tensor): value used to replace the pixels in erased region. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + paddle.Tensor: Erased image. + + """ + _assert_image_tensor(img, 'CHW') + if not inplace: + img = img.clone() + + img[..., i:i + h, j:j + w] = v + return img + + def center_crop(img, output_size, data_format='CHW'): """Crops the given paddle.Tensor Image and resize it to desired size. @@ -514,3 +600,127 @@ def resize(img, size, interpolation='bilinear', data_format='CHW'): data_format='N' + data_format.upper()) return img.squeeze(0) + + +def adjust_brightness(img, brightness_factor): + """Adjusts brightness of an Image. + + Args: + img (paddle.Tensor): Image to be adjusted. + brightness_factor (float): How much to adjust the brightness. Can be + any non negative number. 0 gives a black image, 1 gives the + original image while 2 increases the brightness by a factor of 2. + + Returns: + paddle.Tensor: Brightness adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert brightness_factor >= 0, "brightness_factor should be non-negative." + assert _get_image_num_channels( + img, 'CHW') in [1, 3], "channels of input should be either 1 or 3." + + extreme_target = paddle.zeros_like(img, img.dtype) + return _blend_images(img, extreme_target, brightness_factor) + + +def adjust_contrast(img, contrast_factor): + """Adjusts contrast of an image. + + Args: + img (paddle.Tensor): Image to be adjusted. + contrast_factor (float): How much to adjust the contrast. Can be any + non negative number. 0 gives a solid gray image, 1 gives the + original image while 2 increases the contrast by a factor of 2. + + Returns: + paddle.Tensor: Contrast adjusted image. + + """ + _assert_image_tensor(img, 'chw') + assert contrast_factor >= 0, "contrast_factor should be non-negative." 
+ + channels = _get_image_num_channels(img, 'CHW') + dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32 + if channels == 1: + extreme_target = paddle.mean( + img.astype(dtype), axis=(-3, -2, -1), keepdim=True) + elif channels == 3: + extreme_target = paddle.mean( + to_grayscale(img).astype(dtype), axis=(-3, -2, -1), keepdim=True) + else: + raise ValueError("channels of input should be either 1 or 3.") + + return _blend_images(img, extreme_target, contrast_factor) + + +def adjust_saturation(img, saturation_factor): + """Adjusts color saturation of an image. + + Args: + img (paddle.Tensor): Image to be adjusted. + saturation_factor (float): How much to adjust the saturation. 0 will + give a black and white image, 1 will give the original image while + 2 will enhance the saturation by a factor of 2. + + Returns: + paddle.Tensor: Saturation adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert saturation_factor >= 0, "saturation_factor should be non-negative." + channels = _get_image_num_channels(img, 'CHW') + if channels == 1: + return img + elif channels == 3: + extreme_target = to_grayscale(img) + else: + raise ValueError("channels of input should be either 1 or 3.") + + return _blend_images(img, extreme_target, saturation_factor) + + +def adjust_hue(img, hue_factor): + """Adjusts hue of an image. + + The image hue is adjusted by converting the image to HSV and + cyclically shifting the intensities in the hue channel (H). + The image is then converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + Args: + img (paddle.Tensor): Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + paddle.Tensor: Hue adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert hue_factor >= -0.5 and hue_factor <= 0.5, "hue_factor should be in range [-0.5, 0.5]" + channels = _get_image_num_channels(img, 'CHW') + if channels == 1: + return img + elif channels == 3: + dtype = img.dtype + if dtype == paddle.uint8: + img = img.astype(paddle.float32) / 255.0 + + img_hsv = _rgb_to_hsv(img) + h, s, v = img_hsv.unbind(axis=-3) + h = (h + hue_factor) + h = h - h.floor() + img_adjusted = _hsv_to_rgb(paddle.stack([h, s, v], axis=-3)) + + if dtype == paddle.uint8: + img_adjusted = (img_adjusted * 255.0).astype(dtype) + else: + raise ValueError("channels of input should be either 1 or 3.") + + return img_adjusted diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index a22f8a2ab4049..828a0d9b0936d 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -25,6 +25,7 @@ import warnings import traceback +import paddle from paddle.utils import try_import from . import functional as F @@ -1342,3 +1343,143 @@ def _apply_image(self, img): PIL Image: Randomly grayscaled image. """ return F.to_grayscale(img, self.num_output_channels) + + +class RandomErasing(BaseTransform): + """Erase the pixels in a rectangle region selected randomly. + + Args: + prob (float, optional): Probability of the input data being erased. Default: 0.5. 
+ scale (sequence, optional): The proportional range of the erased area to the input image. + Default: (0.02, 0.33). + ratio (sequence, optional): Aspect ratio range of the erased area. Default: (0.3, 3.3). + value (int|float|sequence|str, optional): The value each pixel in erased area will be replaced with. + If value is a single number, all pixels will be erased with this value. + If value is a sequence with length 3, the R, G, B channels will be ereased + respectively. If value is set to "random", each pixel will be erased with + random values. Default: 0. + inplace (bool, optional): Whether this transform is inplace. Default: False. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W). + For np.array input, the shape should be (H, W, C). + - output(paddle.Tensor | np.array | PIL.Image): A random erased image. + + Returns: + A callable object of RandomErasing. + + Examples: + + .. code-block:: python + + import paddle + + fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + transform = paddle.vision.transforms.RandomErasing() + result = transform(fake_img) + """ + + def __init__(self, + prob=0.5, + scale=(0.02, 0.33), + ratio=(0.3, 3.3), + value=0, + inplace=False, + keys=None): + super(RandomErasing, self).__init__(keys) + assert isinstance(scale, + (tuple, list)), "scale should be a tuple or list" + assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] + ), "scale should be of kind (min, max) and in range [0, 1]" + assert isinstance(ratio, + (tuple, list)), "ratio should be a tuple or list" + assert (ratio[0] >= 0 and + ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" + assert (prob >= 0 and + prob <= 1), "The probability should be in range [0, 1]" + assert isinstance( + value, (numbers.Number, str, tuple, + list)), "value should be a number, tuple, list or str" + if isinstance(value, str) and value != "random": + raise ValueError("value must be 'random' when type is str") + + self.prob = prob + self.scale = scale + self.ratio = ratio + self.value = value + self.inplace = inplace + + def _get_param(self, img, scale, ratio, value): + """Get parameters for ``erase`` for a random erasing. + + Args: + img (paddle.Tensor | np.array | PIL.Image): Image to be erased. + scale (sequence, optional): The proportional range of the erased area to the input image. + ratio (sequence, optional): Aspect ratio range of the erased area. + value (sequence | None): The value each pixel in erased area will be replaced with. + If value is a sequence with length 3, the R, G, B channels will be ereased + respectively. If value is None, each pixel will be erased with random values. + + Returns: + tuple: params (i, j, h, w, v) to be passed to ``erase`` for random erase. 
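The sampling loop that follows implements the usual Random Erasing recipe: draw a target area from `scale` (as a fraction of the image area), draw an aspect ratio log-uniformly from `ratio`, and retry up to ten times until the patch fits inside the image. A rough NumPy-only sketch of that sampling, using the transform's default scale and ratio:

.. code-block:: python

    # Standalone illustration of the erase-box sampling; not the transform's exact code path.
    import numpy as np

    def sample_erase_box(h, w, scale=(0.02, 0.33), ratio=(0.3, 3.3), tries=10):
        area = h * w
        log_ratio = np.log(ratio)
        for _ in range(tries):
            erase_area = np.random.uniform(*scale) * area
            aspect = np.exp(np.random.uniform(*log_ratio))
            erase_h = int(round(np.sqrt(erase_area * aspect)))
            erase_w = int(round(np.sqrt(erase_area / aspect)))
            if erase_h < h and erase_w < w:
                top = np.random.randint(0, h - erase_h + 1)
                left = np.random.randint(0, w - erase_w + 1)
                return top, left, erase_h, erase_w
        return 0, 0, h, w  # give up: the caller then re-writes the original pixels, i.e. a no-op

    print(sample_erase_box(224, 224))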
+ """ + if F._is_pil_image(img): + shape = np.asarray(img).astype(np.uint8).shape + h, w, c = shape[-3], shape[-2], shape[-1] + elif F._is_numpy_image(img): + h, w, c = img.shape[-3], img.shape[-2], img.shape[-1] + elif F._is_tensor_image(img): + c, h, w = img.shape[-3], img.shape[-2], img.shape[-1] + + img_area = h * w + log_ratio = np.log(ratio) + for _ in range(10): + erase_area = np.random.uniform(*scale) * img_area + aspect_ratio = np.exp(np.random.uniform(*log_ratio)) + erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) + erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) + if erase_h >= h or erase_w >= w: + continue + if F._is_tensor_image(img): + if value is None: + v = paddle.normal( + shape=[c, erase_h, erase_w]).astype(img.dtype) + else: + v = paddle.to_tensor(value, dtype=img.dtype)[:, None, None] + else: + if value is None: + v = np.random.normal(size=[erase_h, erase_w, c]) * 255 + else: + v = np.array(value)[None, None, :] + top = np.random.randint(0, h - erase_h + 1) + left = np.random.randint(0, w - erase_w + 1) + + return top, left, erase_h, erase_w, v + + return 0, 0, h, w, img + + def _apply_image(self, img): + """ + Args: + img (paddle.Tensor | np.array | PIL.Image): Image to be Erased. + + Returns: + output (paddle.Tensor np.array | PIL.Image): A random erased image. + """ + + if random.random() < self.prob: + if isinstance(self.value, numbers.Number): + value = [self.value] + elif isinstance(self.value, str): + value = None + else: + value = self.value + if value is not None and not (len(value) == 1 or len(value) == 3): + raise ValueError( + "Value should be a single number or a sequence with length equals to image's channel." + ) + top, left, erase_h, erase_w, v = self._get_param(img, self.scale, + self.ratio, value) + return F.erase(img, top, left, erase_h, erase_w, v, self.inplace) + return img diff --git a/python/requirements.txt b/python/requirements.txt index 5f2b788a81a0a..e7fc6cd651cb0 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -6,3 +6,4 @@ six decorator astor paddle_bfloat==0.1.2 +opt_einsum==3.3.0 diff --git a/python/setup.py.in b/python/setup.py.in index 0f231e34168d9..4cf8bc3fc6a2e 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -312,6 +312,8 @@ packages=['paddle', 'paddle.distributed.auto_parallel.tuner', 'paddle.distributed.auto_parallel.cost', 'paddle.distributed.passes', + 'paddle.distributed.models', + 'paddle.distributed.models.moe', 'paddle.framework', 'paddle.jit', 'paddle.jit.dy2static', @@ -366,6 +368,10 @@ packages=['paddle', 'paddle.incubate.nn.functional', 'paddle.incubate.nn.layer', 'paddle.incubate.optimizer.functional', + 'paddle.incubate.distributed', + 'paddle.incubate.distributed.models', + 'paddle.incubate.distributed.models.moe', + 'paddle.incubate.distributed.models.moe.gate', 'paddle.io', 'paddle.optimizer', 'paddle.nn', diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 45d4731ba1dba..630005bccbaf7 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -20,7 +20,7 @@ if [ -z ${BRANCH} ]; then fi PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" -approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` +approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` failed_num=0 echo_list=() diff --git a/tools/check_file_diff_approvals.sh 
b/tools/check_file_diff_approvals.sh index b0800a9cd845e..b2d2e792c995b 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -71,7 +71,7 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/eager/backward.h" ) -approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` +approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` git_files=`git diff --numstat upstream/$BRANCH| wc -l` git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 diff --git a/tools/check_ut.py b/tools/check_ut.py index f5fe4c687dd78..fa50f5cc81f13 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -24,7 +24,7 @@ class PRChecker(object): """ PR Checker. """ def __init__(self): - self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) + self.github = Github(timeout=60) self.repo = None def check(self, filename, msg): diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 8e84eccc083f2..878660cefaf21 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -307,7 +307,7 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person. # Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x -approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) +approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" diff --git a/tools/coverage/cuda_clean.py b/tools/coverage/cuda_clean.py index 8c03edd078549..28142c869d04c 100644 --- a/tools/coverage/cuda_clean.py +++ b/tools/coverage/cuda_clean.py @@ -30,8 +30,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest: The pull request. 
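A recurring change in these CI helpers is constructing the GitHub client and the review queries without GITHUB_API_TOKEN; anonymous access is sufficient for the read-only requests they make, at the cost of a much lower rate limit. A minimal sketch of the pattern, assuming PyGithub is installed:

.. code-block:: python

    # Unauthenticated PyGithub client, mirroring the token-less calls used by these scripts.
    from github import Github

    github = Github(timeout=60)                  # no token: anonymous, more tightly rate-limited
    repo = github.get_repo('PaddlePaddle/Paddle')
    pull = repo.get_pull(12345)                  # 12345 is a hypothetical PR number
    print(pull.title, pull.user.login)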
""" - token = os.getenv('GITHUB_API_TOKEN') - github = Github(token, timeout=60) + github = Github(timeout=60) repo = github.get_repo('PaddlePaddle/Paddle') pull = repo.get_pull(pull_id) diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py index 12bd04a6907ea..33d9a8f6c78a3 100644 --- a/tools/coverage/gcda_clean.py +++ b/tools/coverage/gcda_clean.py @@ -32,8 +32,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest """ - token = os.getenv('GITHUB_API_TOKEN') - github = Github(token, timeout=60) + github = Github(timeout=60) idx = 1 while idx < 4: try: diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index f3e88286ca965..20399f1c2e630 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -24,8 +24,6 @@ from github import Github -token = os.getenv('GITHUB_API_TOKEN') - def get_pull(pull_id): """ @@ -35,7 +33,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest """ - github = Github(token, timeout=60) + github = Github(timeout=60) repo = github.get_repo('PaddlePaddle/Paddle') pull = repo.get_pull(pull_id) diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index 715bd34b908be..08536ae401fe1 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -1,10 +1,10 @@ # A image for building paddle binaries # build docker image -# docker build -t paddlepaddle/paddle:ipu-dev-2.3.0 -f tools/dockerfile/Dockerfile.ipu . +# docker build -t paddlepaddle/paddle:latest-dev-ipu -f tools/dockerfile/Dockerfile.ipu . # run a container -# docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:ipu-dev-2.3.0 bash +# docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash FROM graphcore/poplar:2.3.0 MAINTAINER PaddlePaddle Authors diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 6b90a656f0107..799f80f139c9c 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -35,7 +35,7 @@ class PRChecker(object): """ PR Checker. """ def __init__(self): - self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) + self.github = Github(timeout=60) self.repo = self.github.get_repo('PaddlePaddle/Paddle') self.py_prog_oneline = re.compile('\d+\|\s*#.*') self.py_prog_multiline_a = re.compile('\d+\|\s*r?""".*?"""', re.DOTALL) diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py new file mode 100644 index 0000000000000..daf80597d3ad0 --- /dev/null +++ b/tools/get_ut_mem_map.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import json +import sys + + +def get_ut_mem(rootPath): + case_dic = {} + for parent, dirs, files in os.walk(rootPath): + for f in files: + if f.endswith('$-gpu.log'): + continue + ut = f.replace('^', '').replace('$.log', '') + case_dic[ut] = {} + filename = '%s%s' % (parent, f) + fi = open(filename) + lines = fi.readlines() + mem_reserved1 = -1 + mem_nvidia1 = -1 + caseTime = -1 + for line in lines: + if '[Memory Usage (Byte)] gpu' in line: + mem_reserved = round( + float( + line.split('[max memory reserved] gpu')[1].split( ':')[1].split('\\n')[0].strip()), 2) + if mem_reserved > mem_reserved1: + mem_reserved1 = mem_reserved + if 'MAX_GPU_MEMORY_USE=' in line: + mem_nvidia = round( + float( + line.split('MAX_GPU_MEMORY_USE=')[1].split('\\n')[0] .strip()), 2) + if mem_nvidia > mem_nvidia1: + mem_nvidia1 = mem_nvidia + if 'Total Test time (real)' in line: + caseTime = float( + line.split('Total Test time (real) =')[1].split('sec')[ 0].strip()) + if mem_reserved1 != -1: + case_dic[ut]['mem_reserved'] = mem_reserved1 + if mem_nvidia1 != -1: + case_dic[ut]['mem_nvidia'] = mem_nvidia1 + if caseTime != -1: + case_dic[ut]['time'] = caseTime + + ut_mem_map_file = "%s/pre_test/ut_mem_map.json" % rootPath + with open(ut_mem_map_file, "w") as f: + json.dump(case_dic, f) + + +if __name__ == "__main__": + rootPath = sys.argv[1] + get_ut_mem(rootPath) diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 0b67c6ba44a1d..b83bfe911aa48 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -20,12 +20,12 @@ #TODO @DannyIsFunny: more attr types need to be supported. attr_type_converter = { - "i": 'SI32Attr', - "b": 'BoolAttr', - "l": 'SI64Attr', - "f": 'F32Attr', - "NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE": 'StrAttr', - "St6vectorIiSaIiEE": 'I32ArrayAttr' + "int": 'SI32Attr', + "bool": 'BoolAttr', + "int64_t": 'SI64Attr', + "float": 'F32Attr', + "string": 'StrAttr', + "vector": 'I32ArrayAttr' } target_type_converter = {"CPU": "CPU", "GPU": "GPU", "Undefined": "UNK"} diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 5dcff12c2c87e..aaa667595f94c 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -387,6 +387,7 @@ 'test_partial_sum_op', 'test_pass_builder', 'test_pixel_shuffle', + 'test_pixel_unshuffle', 'test_polygon_box_transform', 'test_pool1d_api', 'test_pool2d_api', diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index bf70d8bc3a495..0b2fff045ff3c 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -319,7 +319,7 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person.
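For reference, the parsing done by the get_ut_mem helper added in tools/get_ut_mem_map.py above boils down to slicing marker strings out of each captured log line before writing <rootPath>/pre_test/ut_mem_map.json. The snippet below walks through one such extraction on a made-up log line (the real log format may differ slightly):

.. code-block:: python

    # Illustration only: the log line is invented to show the string slicing used by get_ut_mem.
    line = "[Memory Usage (Byte)] gpu 0 : [max memory reserved] gpu 0 : 1048576\\n"
    mem_reserved = round(
        float(
            line.split('[max memory reserved] gpu')[1].split(':')[1].split('\\n')[0].strip()),
        2)
    print(mem_reserved)  # 1048576.0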
# Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x -approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) +approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" diff --git a/tools/test_runner.py b/tools/test_runner.py index 2d0c9c4a131c9..7ceed18634a87 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import importlib +import paddle.fluid.core as core from six.moves import cStringIO sys.path.append(os.path.abspath(os.path.dirname(__file__))) @@ -28,6 +29,10 @@ def main(): sys.path.append(os.getcwd()) + if core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): + if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None): + os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true' + some_test_failed = False for module_name in sys.argv[1:]: flag_need_static_mode = False @@ -45,6 +50,7 @@ def main(): module = importlib.import_module(module_name) tests = test_loader.loadTestsFromModule(module) res = unittest.TextTestRunner(stream=buffer).run(tests) + if not res.wasSuccessful(): some_test_failed = True print( diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index e102552f87c2b..fb173442a3319 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -13,11 +13,11 @@ :: limitations under the License. :: :: =============================== -:: Build Paddle compile enviroment +:: Build Paddle compile environment :: =============================== :: Description: :: -:: Install compile enviroment for xly CI. +:: Install compile environment for xly CI. :: :: Include: :: 1. CMake 3.17.0 diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh index 576f0e5d238ab..136e21e60415f 100644 --- a/tools/windows/check_change_of_unittest.sh +++ b/tools/windows/check_change_of_unittest.sh @@ -15,16 +15,15 @@ set -e set +x export PADDLE_ROOT="$(cd "$PWD/../" && pwd )" -GITHUB_API_TOKEN=$GITHUB_API_TOKEN GIT_PR_ID=$AGILE_PULL_ID BRANCH=$BRANCH -if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then +if [ "${GIT_PR_ID}" == "" ];then exit 0 fi unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g') if [ "$unittest_spec_diff" != "" ]; then - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then