diff --git a/.clang-tidy b/.clang-tidy index d5935e488ae..31aca7b30a7 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,4 +1,15 @@ -# maybe-* checks are only available on OneFlow custom clang-tidy and clangd -Checks: '-*, maybe-*' +# `maybe-*` checks are only available in OneFlow's custom clang-tidy and clangd +# `-allow-enabling-analyzer-alpha-checkers` should be passed to clang-tidy for CSA checkers named `clang-analyzer-alpha.*` (or `-allow-enabling-alpha-checkers` for run-clang-tidy.py) +# `aggressive-binary-operation-simplification` should be enabled (via `-Xclang -analyzer-config -Xclang aggressive-binary-operation-simplification=true` in clang) +# there are problems with `clang-analyzer-alpha.clone.*`, so it is not enabled +# `clang-analyzer-alpha.deadcode.*` is too verbose to enable +Checks: '-*, maybe-*, clang-analyzer-core.*, clang-analyzer-cplusplus.*, clang-analyzer-nullability.*, clang-analyzer-deadcode.*, clang-analyzer-security.*, clang-analyzer-optin.cplusplus.*, clang-analyzer-optin.performance.*, clang-analyzer-alpha.core.*, clang-analyzer-alpha.cplusplus.*, clang-analyzer-alpha.security.*, cppcoreguidelines-avoid-goto, cppcoreguidelines-init-variables, cppcoreguidelines-interfaces-global-init, cppcoreguidelines-no-malloc, cppcoreguidelines-prefer-member-initializer, cppcoreguidelines-pro-type-member-init, cppcoreguidelines-pro-type-static-cast-downcast, cppcoreguidelines-slicing, cppcoreguidelines-special-member-functions, performance-unnecessary-value-param, performance-unnecessary-copy-initialization, performance-noexcept-move-constructor, performance-no-automatic-move, performance-move-const-arg, performance-implicit-conversion-in-loop, performance-for-range-copy, google-default-arguments, google-global-names-in-headers, google-explicit-constructor' # TODO: treat all maybe warnings as errors when existing warnings are all fixed -WarningsAsErrors: 'maybe-unused' +WarningsAsErrors: 'maybe-unused, clang-analyzer-nullability.*, clang-analyzer-cplusplus.*, performance-implicit-conversion-in-loop, performance-move-const-arg, performance-no-automatic-move, performance-noexcept-move-constructor, google-default-arguments, google-global-names-in-headers' + +CheckOptions: + # `cppcoreguidelines-special-member-functions` is enabled, refer to https://en.cppreference.com/w/cpp/language/rule_of_three + - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor + value: True + - key: performance-move-const-arg.CheckTriviallyCopyableMove + value: False diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index 862217aa37f..b5501d4824d 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -50,7 +50,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DBUILD_TESTING=ON cmake --build . -j$(nproc) --target of_git_version oneflow_deps generate_functional of_cfgobj generate_py_cfg - - name: Run Maybe-related checks by clang-tidy + - name: Run clang-tidy for all translation units # use clang as the compiler to get the correct compiler flags run: | cd build @@ -62,7 +62,7 @@ jobs: -DBUILD_TESTING=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cd .. - ./run-clang-tidy.py -clang-tidy-binary ./clang-tidy-489012f-x86_64.AppImage -p build -quiet + ./run-clang-tidy.py -clang-tidy-binary ./clang-tidy-489012f-x86_64.AppImage -p build -quiet -allow-enabling-alpha-checkers -extra-arg="-Xclang" -extra-arg="-analyzer-config" -extra-arg="-Xclang" -extra-arg="aggressive-binary-operation-simplification=true" '^((?!third_party_install).)+(?
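For reference, a hypothetical snippet (not part of this PR) illustrating the two `CheckOptions` entries above: `AllowSoleDefaultDtor` exempts classes whose only special member function is a defaulted destructor from `cppcoreguidelines-special-member-functions`, and `CheckTriviallyCopyableMove=False` keeps `performance-move-const-arg` quiet when `std::move` is applied to a trivially copyable type.

#include <utility>

// AllowSoleDefaultDtor=True: a class whose only special member function is a
// defaulted destructor is NOT flagged by cppcoreguidelines-special-member-functions.
class Base {
 public:
  virtual ~Base() = default;  // allowed as the sole special member
};

// Declaring some-but-not-all special members is still flagged (rule of five).
class Flagged {
 public:
  Flagged(const Flagged&) {}  // warning: copy ctor without the other four members
};

// CheckTriviallyCopyableMove=False: performance-move-const-arg stays quiet here.
int Consume(int x) { return x; }
int Example() {
  int v = 1;
  return Consume(std::move(v));  // not flagged: int is trivially copyable
}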
Backward(const one::TensorTuple& outputs, const one::Ten bool retain_graph, bool create_graph) { if (create_graph) { retain_graph = true; } std::shared_ptr gradients = JUST(CheckAndInitOutGrads(outputs, out_grads)); - JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensor( + JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensorIf( outputs, *gradients, retain_graph, create_graph)); return std::make_shared(0); } @@ -86,7 +86,7 @@ Maybe Grad(const one::TensorTuple& outputs, const one::TensorT [](const std::shared_ptr& tensor) { return tensor->requires_grad(); })) << "All input tensors `.requires_grad` should be true"; std::shared_ptr gradients = JUST(CheckAndInitOutGrads(outputs, out_grads)); - return one::GetThreadLocalAutogradEngine()->RunBackwardAndReturnInputsTensorGrad( + return one::GetThreadLocalAutogradEngine()->RunBackwardAndReturnInputsTensorGradIf( outputs, inputs, *gradients, retain_graph, create_graph); } diff --git a/oneflow/api/python/autograd/no_grad_guard.cpp b/oneflow/api/python/autograd/autograd_mode.cpp similarity index 73% rename from oneflow/api/python/autograd/no_grad_guard.cpp rename to oneflow/api/python/autograd/autograd_mode.cpp index 7d8aea56ab7..975e3f87c9c 100644 --- a/oneflow/api/python/autograd/no_grad_guard.cpp +++ b/oneflow/api/python/autograd/autograd_mode.cpp @@ -26,11 +26,12 @@ namespace oneflow { namespace autograd { ONEFLOW_API_PYBIND11_MODULE("autograd", m) { - py::class_>(m, "no_grad") - .def(py::init([]() { return std::make_shared(); })) - .def("__enter__", [](const NoGradGuard& no_grad_obj) {}) - .def("__exit__", [](const NoGradGuard& no_grad_obj, const py::object& type, + py::class_>(m, "AutoGradMode") + .def(py::init([](bool mode) { return std::make_shared(mode); })) + .def("__enter__", [](const AutoGradMode& no_grad_obj) {}) + .def("__exit__", [](const AutoGradMode& no_grad_obj, const py::object& type, const py::object& value, const py::object& traceback) {}); + m.def("is_grad_enabled", &GradMode::is_enabled); } } // namespace autograd diff --git a/oneflow/api/python/framework/tensor.cpp b/oneflow/api/python/framework/tensor.cpp index 74c7837355c..839be1cf1ad 100644 --- a/oneflow/api/python/framework/tensor.cpp +++ b/oneflow/api/python/framework/tensor.cpp @@ -30,6 +30,7 @@ limitations under the License. 
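The `AutoGradMode` binding above is a thin wrapper over an RAII guard; `__enter__`/`__exit__` are no-ops because construction and destruction of the C++ object do the work. A minimal sketch, assuming a thread-local flag behind `GradMode::is_enabled` (the guard internals here are illustrative, not OneFlow's exact implementation):

#include <cassert>

// Thread-local grad-mode flag, as exposed to Python via is_grad_enabled().
class GradMode {
 public:
  static bool is_enabled() { return *enabled(); }
  static void set_enabled(bool mode) { *enabled() = mode; }

 private:
  static bool* enabled() {
    static thread_local bool enabled = true;
    return &enabled;
  }
};

// RAII guard: flips the flag on construction, restores it on destruction,
// so `with AutoGradMode(False)` disables grad recording for the scope.
class AutoGradMode {
 public:
  explicit AutoGradMode(bool mode) : prev_(GradMode::is_enabled()) {
    GradMode::set_enabled(mode);
  }
  ~AutoGradMode() { GradMode::set_enabled(prev_); }

 private:
  bool prev_;
};

void Example() {
  assert(GradMode::is_enabled());
  {
    AutoGradMode no_grad(false);
    assert(!GradMode::is_enabled());
  }
  assert(GradMode::is_enabled());
}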
#include "oneflow/core/framework/tensor_method.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/stride.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/py_distribute.h" #include "oneflow/core/functional/value_types.h" #include "oneflow/core/job/placement.cfg.h" @@ -299,9 +300,8 @@ Maybe NewTensor(py::args args, py::kwargs kwargs, Symbol desired_ if (other_tensor->is_local()) { if (placement) { // LocalTensor -> ConsistentTensor - tensor = JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, - /* identity_grad */ false, - /* grad_sbp_parallels */ {})); + tensor = + JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, GetNoneSbpList())); } else { // LocalTensor -> LocalTensor if (!device) { device = JUST(Device::New("cpu")); } @@ -310,9 +310,8 @@ Maybe NewTensor(py::args args, py::kwargs kwargs, Symbol desired_ } else { if (placement) { // ConsistentTensor -> ConsistentTensor - tensor = JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, - /* identity_grad */ false, - /* grad_sbp_parallels */ {})); + tensor = + JUST(functional::ToConsistent(other_tensor, placement, sbp_tuple, GetNoneSbpList())); } else { // ConsistentTensor -> LocalTensor tensor = JUST(functional::ConsistentToLocal(other_tensor)); diff --git a/oneflow/api/python/symbol/placement_symbol.cpp b/oneflow/api/python/symbol/placement_symbol.cpp index d5e5bf5a0f2..0af7c1470c9 100644 --- a/oneflow/api/python/symbol/placement_symbol.cpp +++ b/oneflow/api/python/symbol/placement_symbol.cpp @@ -40,53 +40,6 @@ Maybe MakeShape(const py::tuple& py_shape) { return std::make_shared(shape_dims); } -std::string SerializePlacementSymbol2String(Symbol placement) { - std::string device_type = placement->device_tag() == "gpu" ? 
"\"cuda\"" : "\"cpu\""; - std::vector sorted_node_ids; - HashMap> node_id2sorted_dev_phy_ids; - for (int64_t machine_id : placement->sorted_machine_ids()) { - int64_t node_id = GlobalProcessCtx::NodeId(machine_id); - if (!std::count(sorted_node_ids.begin(), sorted_node_ids.end(), node_id)) { - sorted_node_ids.push_back(node_id); - } - for (int64_t device_id : placement->sorted_dev_phy_ids(machine_id)) { - node_id2sorted_dev_phy_ids[node_id].push_back(device_id); - } - } - std::string machine_device_ids = "{"; - int64_t node_idx = 0; - for (int64_t node_id : sorted_node_ids) { - std::string device_name = std::to_string(node_id) + " : ["; - int64_t device_idx = 0; - for (int64_t device_id : node_id2sorted_dev_phy_ids.at(node_id)) { - device_name += std::to_string(device_id); - if (++device_idx != node_id2sorted_dev_phy_ids.at(node_id).size()) { device_name += ", "; } - } - device_name += "]"; - if (++node_idx != sorted_node_ids.size()) { device_name += ", "; } - machine_device_ids += device_name; - } - machine_device_ids += "}"; - std::string hierarchy = "("; - int32_t hierarchy_dim_idx = 0; - for (int64_t dim : placement->hierarchy()->dim_vec()) { - hierarchy += std::to_string(dim); - if (++hierarchy_dim_idx != placement->hierarchy()->dim_vec().size()) { - hierarchy += ", "; - } else if (placement->hierarchy()->dim_vec().size() == 1) { - hierarchy += ","; - } - } - hierarchy += ")"; - std::string placement_str = "oneflow.placement(device_type=" + device_type - + ", machine_device_ids=" + machine_device_ids - + ", hierarchy=" + hierarchy + ")"; - return placement_str; -} - -auto* CachedSerializePlacementSymbol2String = - DECORATE(&SerializePlacementSymbol2String, ThreadLocal); - struct PlacementSymbolExportUtil { static std::shared_ptr ApiCreatePlacementSymbol( int64_t symbol_id, const std::shared_ptr& symbol_conf) { @@ -207,7 +160,7 @@ struct PlacementSymbolExportUtil { } static std::string PlacementSymbol2String(Symbol placement) { - return CachedSerializePlacementSymbol2String(placement); + return *PlacementToString(placement).GetPtrOrThrow(); } static Maybe> ReplacePlacementDeviceTag(Symbol parallel_desc, diff --git a/oneflow/api/python/symbol/sbp_symbol.cpp b/oneflow/api/python/symbol/sbp_symbol.cpp index 9ed80e214f4..a6787ac0a7d 100644 --- a/oneflow/api/python/symbol/sbp_symbol.cpp +++ b/oneflow/api/python/symbol/sbp_symbol.cpp @@ -20,6 +20,7 @@ limitations under the License. #include "oneflow/core/common/constant.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/symbol.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/job/sbp_parallel.cfg.h" #include "oneflow/core/job/sbp_parallel.h" @@ -30,17 +31,7 @@ namespace oneflow { namespace { std::string SbpParallelSymbolToString(const Symbol& sbp_sym) { - std::string sbp_str = "oneflow.sbp."; - if (sbp_sym->has_broadcast_parallel()) { - sbp_str += "broadcast"; - } else if (sbp_sym->has_partial_sum_parallel()) { - sbp_str += "partial_sum"; - } else if (sbp_sym->has_split_parallel()) { - sbp_str += "split(axis=" + std::to_string(sbp_sym->split_parallel().axis()) + ")"; - } else { - UNIMPLEMENTED(); - } - return sbp_str; + return *SbpToString(sbp_sym).GetPtrOrThrow(); } Maybe>> MakeSplitSbpParallelList(int max_split_axis) { diff --git a/oneflow/core/autograd/autograd_engine.cpp b/oneflow/core/autograd/autograd_engine.cpp index d2ae9d31b7d..f1a43ca6046 100644 --- a/oneflow/core/autograd/autograd_engine.cpp +++ b/oneflow/core/autograd/autograd_engine.cpp @@ -21,6 +21,7 @@ limitations under the License. 
#include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_arg.h" #include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/tensor_rpc_util.h" #include "oneflow/core/autograd/autograd_mode.h" #include "oneflow/core/eager/dev_vm_dep_object_consume_mode.h" #include "oneflow/core/functional/functional.h" @@ -56,20 +57,42 @@ Maybe CopyOrAccGrad(AutogradMeta* autograd_meta, bool autograd_mode) { return Maybe::Ok(); } +Maybe RawTorchConsistentTensor(const std::shared_ptr& tensor) { + // Do nothing. + return Maybe::Ok(); +} + +static constexpr auto* TorchConsistentTensor = + DECORATE(&RawTorchConsistentTensor, CheckConsistentTensorMeta); + +Maybe CheckConsistentTensorsMeta(const TensorTuple& tensor_tuple) { + for (const auto& tensor : tensor_tuple) { + if (tensor->is_consistent()) { JUST(TorchConsistentTensor(tensor)); } + } + return Maybe::Ok(); +} + } // namespace -Maybe AutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) { - return RunBackwardAndSaveGrads4LeafTensorIf(outputs, out_grads, retain_graph, create_graph); +Maybe AutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { + JUST(CheckConsistentTensorsMeta(outputs)); + JUST(CheckConsistentTensorsMeta(out_grads)); + DisableCheckConsistentTensorMetaScope disable_meta_check; + return RunBackwardAndSaveGrads4LeafTensor(outputs, out_grads, retain_graph, create_graph); } -Maybe AutogradEngine::RunBackwardAndReturnInputsTensorGrad( +Maybe AutogradEngine::RunBackwardAndReturnInputsTensorGradIf( const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, bool retain_graph, bool create_graph) { - return RunBackwardAndReturnInputsTensorGradIf(outputs, inputs, out_grads, retain_graph, - create_graph); + JUST(CheckConsistentTensorsMeta(outputs)); + JUST(CheckConsistentTensorsMeta(inputs)); + JUST(CheckConsistentTensorsMeta(out_grads)); + DisableCheckConsistentTensorMetaScope disable_meta_check; + return RunBackwardAndReturnInputsTensorGrad(outputs, inputs, out_grads, retain_graph, + create_graph); } StackFunctionNode::StackFunctionNode( @@ -167,10 +190,10 @@ void StackAutogradEngine::ClearReleasedFunctionNodes() { node_list_.end()); } -Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) { +Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { ClearReleasedFunctionNodes(); for (int i = 0; i < outputs.size(); ++i) { JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i))); @@ -190,7 +213,7 @@ Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const Tens return Maybe::Ok(); } -Maybe StackAutogradEngine::RunBackwardAndReturnInputsTensorGradIf( +Maybe StackAutogradEngine::RunBackwardAndReturnInputsTensorGrad( const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, bool retain_graph, bool create_graph) { ClearReleasedFunctionNodes(); @@ -396,10 +419,10 @@ Maybe GraphTask::Apply(bool save_grad_for_leaf) { return Maybe::Ok(); } -Maybe GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) { +Maybe 
GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { for (int i = 0; i < outputs.size(); ++i) { JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i))); } @@ -409,7 +432,7 @@ Maybe GraphAutogradEngine::RunBackwardAndSaveGrads4LeafTensorIf(const Tens return Maybe::Ok(); } -Maybe GraphAutogradEngine::RunBackwardAndReturnInputsTensorGradIf( +Maybe GraphAutogradEngine::RunBackwardAndReturnInputsTensorGrad( const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, bool retain_graph, bool create_graph) { std::shared_ptr input_current_grad = std::make_shared(inputs.size()); diff --git a/oneflow/core/autograd/autograd_engine.h b/oneflow/core/autograd/autograd_engine.h index 5fe1230f0e7..f3f05cafec0 100644 --- a/oneflow/core/autograd/autograd_engine.h +++ b/oneflow/core/autograd/autograd_engine.h @@ -69,13 +69,13 @@ class AutogradEngine { public: virtual ~AutogradEngine() = default; - Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, - const TensorTuple& out_grads, bool retain_graph, - bool create_graph); - Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, bool create_graph); + Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph); + Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, bool create_graph); virtual void ClearEngine() = 0; // Builds FunctionNode, binding to all `outputs_` tensors and saving in AutogradEngine virtual Maybe AddBackwardFuncPtr( @@ -88,15 +88,14 @@ class AutogradEngine { AutogradEngine() = default; private: - virtual Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) = 0; - virtual Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) = 0; + virtual Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, bool create_graph) = 0; + virtual Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) = 0; }; // Stack Autograd Node and Engine @@ -137,14 +136,14 @@ class StackAutogradEngine final : public AutogradEngine { // moment. 
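The renames above (public `...If` wrappers, private virtual workers) follow the non-virtual-interface idiom: the public entry points run the consistent-tensor meta checks exactly once, open a scope that disables further checks, then dispatch to the per-engine implementation. A condensed sketch with placeholder types (not the real signatures):

// Condensed, illustrative sketch of the NVI split used by AutogradEngine above.
class EngineSketch {
 public:
  virtual ~EngineSketch() = default;

  // Public non-virtual entry point ("...If"): validate once, open the
  // check-disabled scope, then dispatch to the virtual implementation.
  void RunBackwardIf() {
    CheckMetas();                           // stands in for CheckConsistentTensorsMeta(...)
    ScopedDisableCheck disable_meta_check;  // stands in for DisableCheckConsistentTensorMetaScope
    (void)disable_meta_check;
    RunBackward();
  }

 private:
  struct ScopedDisableCheck {};    // RAII scope placeholder
  void CheckMetas() {}             // placeholder check
  virtual void RunBackward() = 0;  // per-engine implementation (stack/graph)
};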
std::list> node_list_; void ClearReleasedFunctionNodes(); - Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, bool retain_graph, - bool create_graph) override; - Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) override; + Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph) override; + Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) override; }; // Graph Autograd Node and Engine @@ -194,14 +193,14 @@ class GraphAutogradEngine final : public AutogradEngine { const TensorTuple& inputs, TensorTuple* outputs) override; private: - Maybe RunBackwardAndSaveGrads4LeafTensorIf(const TensorTuple& outputs, - const TensorTuple& out_grads, bool retain_graph, - bool create_graph) override; - Maybe RunBackwardAndReturnInputsTensorGradIf(const TensorTuple& outputs, - const TensorTuple& inputs, - const TensorTuple& out_grads, - bool retain_graph, - bool create_graph) override; + Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph) override; + Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) override; }; AutogradEngine* GetThreadLocalAutogradEngine(); diff --git a/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp b/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp new file mode 100644 index 00000000000..a6e4d504f1c --- /dev/null +++ b/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp @@ -0,0 +1,73 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/id_util.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/op_builder.h" +#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/framework/op_expr.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/common/optional.h" + +namespace oneflow { +namespace one { + +struct ConsistentToConsistentState : public AutoGradCaptureState { + Symbol parallel_desc; + Symbol nd_sbp; +}; + +class ConsistentToConsistentGradFunction : public OpExprGradFunction { + public: + Maybe Init(const OpExpr& op) override { + const auto* fw_op_expr = dynamic_cast(&op); + CHECK_NOTNULL_OR_RETURN(fw_op_expr); + grad_nd_sbp_ = fw_op_expr->grad_nd_sbp(); + return Maybe::Ok(); + } + + Maybe Capture(ConsistentToConsistentState* ctx, const TensorTuple& inputs, + const TensorTuple& outputs, + const OpExprInterpContext& interp_ctx) const override { + CHECK_EQ_OR_RETURN(inputs.size(), 1); + ctx->parallel_desc = JUST(inputs.at(0)->parallel_desc()); + ctx->nd_sbp = JUST(inputs.at(0)->nd_sbp()); + return Maybe::Ok(); + } + + Maybe Apply(const ConsistentToConsistentState* ctx, const TensorTuple& out_grads, + TensorTuple* in_grads) const override { + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + const auto& out_grad = out_grads.at(0); + CHECK_OR_RETURN(out_grad->is_consistent()); + in_grads->resize(1); + const auto& grad_nd_sbp = + grad_nd_sbp_.has_value() ? JUST(grad_nd_sbp_.value()) : JUST(out_grad->nd_sbp()); + const auto& grad_sbp_list = JUST(GetSbpList(grad_nd_sbp)); + const auto& grad_grad_sbp_list = JUST(GetSbpList(ctx->nd_sbp)); + in_grads->at(0) = JUST(one::functional::ToConsistent(out_grad, ctx->parallel_desc, + *grad_sbp_list, *grad_grad_sbp_list)); + return Maybe::Ok(); + } + + private: + Optional> grad_nd_sbp_; +}; + +REGISTER_OP_EXPR_GRAD_FUNCTION("consistent_to_consistent", ConsistentToConsistentGradFunction); + +} // namespace one +} // namespace oneflow diff --git a/oneflow/core/common/error.h b/oneflow/core/common/error.h index 47a55a67dcd..9785b16b3d8 100644 --- a/oneflow/core/common/error.h +++ b/oneflow/core/common/error.h @@ -112,6 +112,12 @@ Error&& operator<<(Error&& error, const T& x) { return std::move(error); } +template<> +inline Error&& operator<<(Error&& error, const std::stringstream& x) { + error << x.str(); + return std::move(error); +} + template<> inline Error&& operator<<(Error&& error, const std::ostream& x) { error << x.rdbuf(); diff --git a/oneflow/core/eager/eager_blob_object.cpp b/oneflow/core/eager/eager_blob_object.cpp index 304465fdff2..5d187a0c207 100644 --- a/oneflow/core/eager/eager_blob_object.cpp +++ b/oneflow/core/eager/eager_blob_object.cpp @@ -47,9 +47,7 @@ Maybe EagerBlobObject::InitBlob() { { header_buffer_.reset(); int64_t header_byte_size = blob_desc_.AlignedByteSizeOfBlobHeader(); - const auto& FreeHeader = [header_byte_size](char* dptr) { std::free(dptr); }; - char* ptr = reinterpret_cast(std::malloc(header_byte_size)); - header_buffer_ = std::unique_ptr>(ptr, FreeHeader); + header_buffer_ = std::make_unique(header_byte_size); } blob_.reset(new Blob(*mem_case_, &blob_desc_, header_buffer_.get(), nullptr)); return Maybe::Ok(); diff --git a/oneflow/core/eager/eager_blob_object.h b/oneflow/core/eager/eager_blob_object.h index e81ff9e792d..aa43ead930e 100644 --- a/oneflow/core/eager/eager_blob_object.h +++ b/oneflow/core/eager/eager_blob_object.h @@ -90,7 +90,7 @@ class EagerBlobObject final : 
public BlobObject { const Optional& dep_object); std::unique_ptr blob_; - std::unique_ptr> header_buffer_; + std::unique_ptr header_buffer_; std::shared_ptr tensor_buffer_; std::size_t blob_body_bytes_; std::unique_ptr non_pod_initer_; diff --git a/oneflow/core/framework/nd_sbp.cpp b/oneflow/core/framework/nd_sbp.cpp index d69ba0729a9..5640ad12910 100644 --- a/oneflow/core/framework/nd_sbp.cpp +++ b/oneflow/core/framework/nd_sbp.cpp @@ -22,20 +22,6 @@ namespace oneflow { namespace { -Maybe> FindOrCreateNdSbp(const std::vector>& sbp_list) { - static thread_local auto* sbp_list2nd_sbp = - new HashMap>, Symbol>(); - auto iter = sbp_list2nd_sbp->find(sbp_list); - if (iter == sbp_list2nd_sbp->end()) { - cfg::NdSbp nd_sbp; - for (Symbol sbp_symbol : sbp_list) { - *(nd_sbp.mutable_sbp_parallel()->Add()) = *sbp_symbol; - } - iter = sbp_list2nd_sbp->emplace(sbp_list, SymbolOf(nd_sbp)).first; - } - return iter->second; -} - Maybe> FindOrCreateNdSbpString(Symbol nd_sbp) { static thread_local auto* nd_sbp2nd_sbp_str = new HashMap, std::shared_ptr>>(); @@ -81,10 +67,6 @@ Maybe> GetDualNdSbp(Symbol nd_sbp) { return iter->second; } -Maybe> GetNdSbp(const std::vector>& sbp_list) { - return FindOrCreateNdSbp(sbp_list); -} - Maybe> GetNdSbpStrList( const std::vector>& sbp_list) { return FindOrCreateNdSbpString(JUST(GetNdSbp(sbp_list))); @@ -98,4 +80,54 @@ Maybe> GetDualNdSbpStrList(Symbol nd_sbp) { return GetNdSbpStrList(JUST(GetDualNdSbp(nd_sbp))); } +namespace private_details { + +Maybe> RawGetNdSbp(const std::vector>& sbp_list) { + CHECK_OR_RETURN(!sbp_list.empty()); + cfg::NdSbp nd_sbp; + for (const auto& sbp : sbp_list) { *(nd_sbp.mutable_sbp_parallel()->Add()) = *sbp; } + return SymbolOf(nd_sbp); +} + +Maybe>> RawGetSbpList(Symbol nd_sbp) { + const auto& vec = std::make_shared>>(); + CHECK_OR_RETURN(!nd_sbp->sbp_parallel().empty()); + for (const auto& sbp_parallel : nd_sbp->sbp_parallel()) { + vec->push_back(SymbolOf(sbp_parallel)); + } + return vec; +} + +} // namespace private_details + +const std::vector>& GetNoneSbpList() { + static thread_local std::vector> none; + return none; +} + +Maybe SbpToString(Symbol sbp_sym) { + std::string sbp_str = "oneflow.sbp."; + if (sbp_sym->has_broadcast_parallel()) { + sbp_str += "broadcast"; + } else if (sbp_sym->has_partial_sum_parallel()) { + sbp_str += "partial_sum"; + } else if (sbp_sym->has_split_parallel()) { + sbp_str += "split(axis=" + std::to_string(sbp_sym->split_parallel().axis()) + ")"; + } else { + UNIMPLEMENTED_THEN_RETURN(); + } + return sbp_str; +} + +Maybe NdSbpToString(Symbol nd_sbp) { + std::string str = "("; + for (int i = 0; i < nd_sbp->sbp_parallel_size(); ++i) { + if (i > 0) { str += ", "; } + str += *JUST(SbpToString(SymbolOf(nd_sbp->sbp_parallel(i)))); + } + if (nd_sbp->sbp_parallel_size() == 1) { str += ","; } + str += ")"; + return str; +} + } // namespace oneflow diff --git a/oneflow/core/framework/nd_sbp.h b/oneflow/core/framework/nd_sbp.h index 5e34be0ee21..445caeafdee 100644 --- a/oneflow/core/framework/nd_sbp.h +++ b/oneflow/core/framework/nd_sbp.h @@ -17,15 +17,17 @@ limitations under the License. 
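The `private_details::Raw*` functions above are intended to be wrapped by `DECORATE(..., ThreadLocal)` in nd_sbp.h (below); a minimal sketch, assuming the decorator memoizes results per argument in a thread-local cache (the helper here is hypothetical):

#include <map>
#include <string>

// Illustrative stand-in for DECORATE(&RawFn, ThreadLocal): memoize RawFn's
// result per argument in a thread-local cache, so repeated lookups for the
// same key (e.g. the same sbp list) build the symbol only once per thread.
template<typename Ret, typename Arg>
Ret ThreadLocalMemoized(Ret (*raw_fn)(const Arg&), const Arg& arg) {
  static thread_local std::map<Arg, Ret> cache;
  auto iter = cache.find(arg);
  if (iter == cache.end()) { iter = cache.emplace(arg, raw_fn(arg)).first; }
  return iter->second;
}

std::string RawRepr(const int& x) { return "value:" + std::to_string(x); }

std::string Example() {
  ThreadLocalMemoized(&RawRepr, 7);
  // Second call hits the thread-local cache instead of recomputing.
  return ThreadLocalMemoized(&RawRepr, 7);
}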
#define ONEFLOW_CORE_FRAMEWORK_ND_SBP_H_ #include +#include "oneflow/core/common/util.h" #include "oneflow/core/common/symbol.h" #include "oneflow/core/common/maybe.h" +#include "oneflow/core/common/decorator.h" #include "oneflow/core/job/sbp_parallel.cfg.h" namespace oneflow { -Maybe> GetDualNdSbp(Symbol sbp_list); +Maybe> GetDualNdSbp(Symbol nd_sbp); -Maybe> GetNdSbp(const std::vector>& sbp_list); +Maybe> GetDualNdSbp(Symbol sbp_list); Maybe> GetNdSbpStrList( const std::vector>& sbp_list); @@ -33,6 +35,23 @@ Maybe> GetNdSbpStrList( Maybe> GetNdSbpStrList(Symbol nd_sbp); Maybe> GetDualNdSbpStrList(Symbol nd_sbp); + +Maybe> GetDualNdSbpStrList(Symbol nd_sbp); + +namespace private_details { + +Maybe> RawGetNdSbp(const std::vector>& sbp_list); +Maybe>> RawGetSbpList(Symbol nd_sbp); + +} // namespace private_details + +static constexpr auto* GetNdSbp = DECORATE(&private_details::RawGetNdSbp, ThreadLocalCopiable); +static constexpr auto* GetSbpList = DECORATE(&private_details::RawGetSbpList, ThreadLocal); +const std::vector>& GetNoneSbpList(); + +Maybe SbpToString(Symbol sbp_sym); +Maybe NdSbpToString(Symbol nd_sbp); + } // namespace oneflow #endif // ONEFLOW_CORE_FRAMEWORK_ND_SBP_H_ diff --git a/oneflow/core/framework/op_expr.cpp b/oneflow/core/framework/op_expr.cpp index 5506f2494ac..42290c006b4 100644 --- a/oneflow/core/framework/op_expr.cpp +++ b/oneflow/core/framework/op_expr.cpp @@ -59,6 +59,11 @@ const std::string& BuiltinOpExprImpl::op_type_name() const { return op_proto_.op_type_name(); } +const std::string& ConsistentToConsistentOpExpr::op_type_name() const { + static const std::string kOpTypeName = "consistent_to_consistent"; + return kOpTypeName; +} + const std::string& CastToConsistentOpExpr::op_type_name() const { static const std::string kOpTypeName = "cast_to_consistent"; return kOpTypeName; @@ -69,11 +74,6 @@ const std::string& CastFromConsistentOpExpr::op_type_name() const { return kOpTypeName; } -const std::string& ConsistentToConsistentOpExpr::op_type_name() const { - static const std::string kOpTypeName = "consistent_to_consistent"; - return kOpTypeName; -} - #define DEFINE_OPEXPR_IS_GRAD_DISABLED_DEFAULT_VALUE(_T, _bool) \ template<> \ Maybe BuiltinOpExprImpl<_T>::IsGradDisabled() const { \ @@ -399,6 +399,16 @@ Maybe> UserOpExpr::InferDevices(const AttrMap& attrs, return TRY(device_infer_fn_(&device_infer_ctx)); } +ConsistentToConsistentOpExpr::ConsistentToConsistentOpExpr( + const Optional>& grad_nd_sbp) + : grad_nd_sbp_(grad_nd_sbp) {} + +/* static */ Maybe ConsistentToConsistentOpExpr::New( + const Optional>& grad_nd_sbp) { + auto* ptr = new ConsistentToConsistentOpExpr(grad_nd_sbp); + return std::shared_ptr(ptr); +} + CastConsistentOpExpr::CastConsistentOpExpr(const std::string& op_name) : op_name_(op_name) {} CastToConsistentOpExpr::CastToConsistentOpExpr(const std::string& op_name) @@ -416,14 +426,6 @@ CastFromConsistentOpExpr::CastFromConsistentOpExpr(const std::string& op_name) return std::shared_ptr(new CastFromConsistentOpExpr(op_name)); } -ConsistentToConsistentOpExpr::ConsistentToConsistentOpExpr(const std::string& op_name) - : CastConsistentOpExpr(op_name) {} - -/* static */ Maybe ConsistentToConsistentOpExpr::New( - const std::string& op_name) { - return std::shared_ptr(new ConsistentToConsistentOpExpr(op_name)); -} - template<> Maybe BuiltinOpExprImpl::BuildOpConf(OperatorConf* op_conf, const AttrMap& attrs) const { @@ -509,6 +511,15 @@ Maybe BuiltinOpExprImpl::GetOrCreateO UNIMPLEMENTED_THEN_RETURN(); } +Maybe 
ConsistentToConsistentOpExpr::GetOrCreateOpGradClosure() const { + if (!op_grad_func_.get()) { + op_grad_func_.reset(NewObj("consistent_to_consistent")); + CHECK_NOTNULL_OR_RETURN(op_grad_func_.get()); + JUST(op_grad_func_->Init(*this)); + } + return std::make_shared(op_grad_func_); +} + Maybe CastToConsistentOpExpr::GetOrCreateOpGradClosure() const { if (!op_grad_func_.get()) { op_grad_func_.reset(NewObj("cast_to_consistent")); diff --git a/oneflow/core/framework/op_expr.h b/oneflow/core/framework/op_expr.h index 8e7d139beda..7238edd303b 100644 --- a/oneflow/core/framework/op_expr.h +++ b/oneflow/core/framework/op_expr.h @@ -18,6 +18,8 @@ limitations under the License. #include "oneflow/core/common/util.h" #include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/optional.h" +#include "oneflow/core/job/sbp_parallel.cfg.h" #include "oneflow/core/operator/op_conf.pb.h" #include "oneflow/core/framework/attr_map.h" #include "oneflow/core/framework/device.h" @@ -155,6 +157,27 @@ class UserOpExpr final : public BuiltinOpExprImpl { std::shared_ptr consistent_tensor_infer_cache_; }; +class ConsistentToConsistentOpExpr : public OpExpr { + public: + virtual ~ConsistentToConsistentOpExpr() = default; + + static Maybe New(const Optional>& grad_nd_sbp); + + const Optional>& grad_nd_sbp() const { return grad_nd_sbp_; } + const std::string& op_type_name() const override; + int input_size() const override { return 1; } + int output_size() const override { return 1; } + + Maybe IsGradDisabled() const override { return false; } + Maybe GetOrCreateOpGradClosure() const override; + + protected: + ConsistentToConsistentOpExpr(const Optional>& grad_nd_sbp); + + Optional> grad_nd_sbp_; // Reserved for configuring grad sbp + mutable std::shared_ptr op_grad_func_; +}; + class CastConsistentOpExpr : public OpExpr { public: virtual ~CastConsistentOpExpr() = default; @@ -198,25 +221,6 @@ class CastFromConsistentOpExpr final : public CastConsistentOpExpr { CastFromConsistentOpExpr(const std::string& op_name); }; -class ConsistentToConsistentOpExpr final : public CastConsistentOpExpr { - public: - ~ConsistentToConsistentOpExpr() = default; - - static Maybe New(const std::string& op_name); - - const std::string& op_type_name() const override; - - // Note(zwx): ConsistentToConsistentOpExpr is currently only used by lazy, - // there's no need to gen grad through autograd engine - Maybe IsGradDisabled() const override { return true; } - Maybe GetOrCreateOpGradClosure() const override { - UNIMPLEMENTED_THEN_RETURN(); - } - - private: - ConsistentToConsistentOpExpr(const std::string& op_name); -}; - // NOTE(chengcheng): For Lazy nn.Graph Feed/Fetch EagerTensor to/from LazyTensor. 
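`GetOrCreateOpGradClosure` above builds the grad function lazily on first use and caches it in a `mutable` member so the accessor can stay `const`. A stripped-down sketch of that pattern with illustrative types:

#include <memory>
#include <string>

// Illustrative lazy-init-and-cache pattern: the grad function is built on
// first request and reused afterwards, which is why op_grad_func_ is declared
// `mutable` on a const accessor path.
struct GradFuncSketch {
  std::string name;
};

class OpExprSketch {
 public:
  std::shared_ptr<GradFuncSketch> GetOrCreateGrad() const {
    if (!grad_func_) {
      // Stands in for NewObj<std::string, OpExprGradFunctionIf>(...) + Init(*this).
      grad_func_ = std::make_shared<GradFuncSketch>();
      grad_func_->name = "consistent_to_consistent";
    }
    return grad_func_;
  }

 private:
  mutable std::shared_ptr<GradFuncSketch> grad_func_;
};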
using FeedInputOpExpr = BuiltinOpExprImpl; using FeedVariableOpExpr = BuiltinOpExprImpl; diff --git a/oneflow/core/framework/op_interpreter.h b/oneflow/core/framework/op_interpreter.h index 71ea6b844d4..47a60ae1b5d 100644 --- a/oneflow/core/framework/op_interpreter.h +++ b/oneflow/core/framework/op_interpreter.h @@ -83,6 +83,7 @@ class OpExprInterpreter { _macro(VariableOp); \ _macro(CastToMirroredOp); \ _macro(CastFromMirroredOp); \ + _macro(ConsistentToConsistentOp); \ _macro(CastToConsistentOp); \ _macro(CastFromConsistentOp); \ _macro(DistributeSplitOp); \ diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.cpp new file mode 100644 index 00000000000..071bc860b51 --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.cpp @@ -0,0 +1,63 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.h" +#include "oneflow/core/job/parallel_desc.h" + +namespace oneflow { + +struct CudaBaseMpiEagerBoxingCall { + std::shared_ptr opt_h2d; + std::shared_ptr gpu_mpi; + std::shared_ptr opt_d2h; +}; + +Maybe RawGetCudaBaseMpiEagerBoxingCall( + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, Symbol out_parallel_desc) { + const auto& gpu_in_parallel_desc = JUST(ReplaceDeviceType(in_parallel_desc, DeviceType::kGPU)); + const auto& gpu_out_parallel_desc = JUST(ReplaceDeviceType(out_parallel_desc, DeviceType::kGPU)); + CHECK_OR_RETURN(gpu_in_parallel_desc == gpu_out_parallel_desc); + const auto& opt_h2d = + JUST(EagerBoxingCall::New(in_nd_sbp, in_nd_sbp, in_parallel_desc, gpu_in_parallel_desc)); + const auto& gpu_mpi = JUST( + EagerBoxingCall::New(in_nd_sbp, out_nd_sbp, gpu_in_parallel_desc, gpu_out_parallel_desc)); + const auto& opt_d2h = + JUST(EagerBoxingCall::New(out_nd_sbp, out_nd_sbp, gpu_out_parallel_desc, out_parallel_desc)); + return std::shared_ptr(new CudaBaseMpiEagerBoxingCall{ + .opt_h2d = opt_h2d, + .gpu_mpi = gpu_mpi, + .opt_d2h = opt_d2h, + }); +} + +static constexpr auto* GetCudaBaseMpiEagerBoxingCall = + DECORATE(&RawGetCudaBaseMpiEagerBoxingCall, ThreadLocal); + +Maybe CudaBasedCpuMpiBoxingInterpreter::InterpretImpl( + const std::shared_ptr& input, Symbol in_nd_sbp, + Symbol out_nd_sbp, Symbol in_parallel_desc, + Symbol out_parallel_desc) const { + const auto& call = JUST( + GetCudaBaseMpiEagerBoxingCall(in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + auto tensor = input; + tensor = JUST(call->opt_h2d->Apply(tensor)); + tensor = JUST(call->gpu_mpi->Apply(tensor)); + tensor = JUST(call->opt_d2h->Apply(tensor)); + return tensor; +} + +} // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h 
b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h new file mode 100644 index 00000000000..b2bd02dc8f5 --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h @@ -0,0 +1,37 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_BASED_CPU_MPI_BOXING_INTERPRETER_H_ +#define ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_BASED_CPU_MPI_BOXING_INTERPRETER_H_ + +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h" + +namespace oneflow { + +class CudaBasedCpuMpiBoxingInterpreter final : public EagerBoxingInterpreter { + public: + CudaBasedCpuMpiBoxingInterpreter() = default; + ~CudaBasedCpuMpiBoxingInterpreter() override = default; + + private: + Maybe InterpretImpl(const std::shared_ptr& input, + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) const override; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_BASED_CPU_MPI_BOXING_INTERPRETER_H_ diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.cpp new file mode 100644 index 00000000000..67c61f95bbb --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.cpp @@ -0,0 +1,41 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h" +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/job/parallel_desc.h" + +namespace oneflow { + +Maybe CudaCopyBoxingInterpreter::InterpretImpl( + const std::shared_ptr& input, Symbol in_nd_sbp, + Symbol out_nd_sbp, Symbol in_parallel_desc, + Symbol out_parallel_desc) const { + CHECK_OR_RETURN(in_nd_sbp == out_nd_sbp); + const auto& new_tag_in_parallel_desc = + JUST(ReplaceDeviceType(in_parallel_desc, out_parallel_desc->device_type())); + CHECK_OR_RETURN(new_tag_in_parallel_desc == out_parallel_desc); + const auto& local_tensor = JUST(input->cur_rank_phy_tensor()); + const auto& sbp_list = JUST(GetSbpList(out_nd_sbp)); + const auto& tensor = + JUST(one::functional::ToConsistent(local_tensor, out_parallel_desc, *sbp_list, {})); + CHECK_OR_RETURN(tensor->is_consistent()); + const auto& tensor_placement = JUST(tensor->parallel_desc()); + CHECK_OR_RETURN(tensor_placement == out_parallel_desc); + return tensor; +} + +} // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h new file mode 100644 index 00000000000..c99c4f7762d --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_COPY_BOXING_INTERPRETER_H_ +#define ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_COPY_BOXING_INTERPRETER_H_ + +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h" + +namespace oneflow { + +class CudaCopyBoxingInterpreter : public EagerBoxingInterpreter { + public: + CudaCopyBoxingInterpreter() = default; + ~CudaCopyBoxingInterpreter() override = default; + + Maybe InterpretImpl(const std::shared_ptr& input, + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) const override; +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_CUDA_COPY_BOXING_INTERPRETER_H_ diff --git a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.cpp new file mode 100644 index 00000000000..1b8083cfedc --- /dev/null +++ b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.cpp @@ -0,0 +1,70 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.h" + +namespace oneflow { + +namespace { +Maybe CheckEagerBoxingDataType(DataType val) { + CHECK_OR_RETURN(val != DataType::kTensorBuffer && val != DataType::kOFRecord) + << "EagerBoxing only supports POD data types."; + return Maybe::Ok(); +} +} // namespace + +Maybe EagerBoxingInterpreter::Interpret(const std::shared_ptr& input, + Symbol in_nd_sbp, + Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) const { + JUST(CheckEagerBoxingDataType(input->dtype()->data_type())); + const auto& tensor = + JUST(InterpretImpl(input, in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + const auto& tensor_nd_sbp = JUST(tensor->nd_sbp()); + const auto& tensor_placement = JUST(tensor->parallel_desc()); + CHECK_OR_RETURN(tensor_nd_sbp == out_nd_sbp) << typeid(*this).name(); + CHECK_OR_RETURN(tensor_placement == out_parallel_desc) << typeid(*this).name(); + return tensor; +} + +Maybe EagerBoxingCall::New(Symbol in_nd_sbp, + Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc) { + const auto* mgr = Global::Get(); + const auto& boxing_interpreter = JUST( + mgr->GetEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + return std::shared_ptr(new EagerBoxingCall{ + .boxing_interpreter = boxing_interpreter, + .in_nd_sbp = in_nd_sbp, + .out_nd_sbp = out_nd_sbp, + .in_parallel_desc = in_parallel_desc, + .out_parallel_desc = out_parallel_desc, + }); +} + +Maybe EagerBoxingCall::Apply(const std::shared_ptr& input) const { + const auto& input_nd_sbp = JUST(input->nd_sbp()); + const auto& input_parallel_desc = JUST(input->parallel_desc()); + CHECK_OR_RETURN(input_nd_sbp == this->in_nd_sbp); + CHECK_OR_RETURN(input_parallel_desc == this->in_parallel_desc); + return this->boxing_interpreter->Interpret(input, this->in_nd_sbp, this->out_nd_sbp, + this->in_parallel_desc, this->out_parallel_desc); +} + +} // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h index 155a3fbef3d..9044d8d45ec 100644 --- a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h +++ b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter.h @@ -24,14 -24,6 @@ limitations under the License.
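`EagerBoxingCall` packages one resolved boxing step (interpreter plus endpoints) so that multi-step boxing, like the `opt_h2d -> gpu_mpi -> opt_d2h` chain in `CudaBasedCpuMpiBoxingInterpreter` above, becomes a simple pipeline. A toy sketch of that composition (types and steps are stand-ins, not OneFlow APIs):

#include <functional>
#include <vector>

// Illustrative pipeline of boxing steps: each step is a precomputed call, and
// ApplyPipeline threads the tensor through them, like chained call->Apply(...).
struct TensorSketch {
  int stage = 0;
};

using Step = std::function<TensorSketch(const TensorSketch&)>;

TensorSketch ApplyPipeline(TensorSketch t, const std::vector<Step>& steps) {
  for (const auto& step : steps) { t = step(t); }
  return t;
}

TensorSketch Example(const TensorSketch& input) {
  Step opt_h2d = [](const TensorSketch& t) { return TensorSketch{t.stage + 1}; };
  Step gpu_mpi = [](const TensorSketch& t) { return TensorSketch{t.stage + 1}; };
  Step opt_d2h = [](const TensorSketch& t) { return TensorSketch{t.stage + 1}; };
  return ApplyPipeline(input, {opt_h2d, gpu_mpi, opt_d2h});
}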
namespace oneflow { -namespace { -inline Maybe CheckEagerBoxingDataType(DataType val) { - CHECK_OR_RETURN(val != DataType::kTensorBuffer && val != DataType::kOFRecord) - << "EagerBoxing only support POD data type."; - return Maybe::Ok(); -} -} // namespace - class EagerBoxingInterpreter { public: OF_DISALLOW_COPY_AND_MOVE(EagerBoxingInterpreter); @@ -41,10 +33,7 @@ class EagerBoxingInterpreter { Maybe Interpret(const std::shared_ptr& input, Symbol in_nd_sbp, Symbol out_nd_sbp, Symbol in_parallel_desc, - Symbol out_parallel_desc) { - JUST(CheckEagerBoxingDataType(input->dtype()->data_type())); - return InterpretImpl(input, in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc); - } + Symbol out_parallel_desc) const; protected: virtual Maybe InterpretImpl(const std::shared_ptr& input, @@ -54,6 +43,20 @@ class EagerBoxingInterpreter { Symbol out_parallel_desc) const = 0; }; +struct EagerBoxingCall { + static Maybe New(Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, + Symbol out_parallel_desc); + + Maybe Apply(const std::shared_ptr& input) const; + + const std::shared_ptr boxing_interpreter; + const Symbol in_nd_sbp; + const Symbol out_nd_sbp; + const Symbol in_parallel_desc; + const Symbol out_parallel_desc; +}; + } // namespace oneflow #endif // ONEFLOW_CORE_FRAMEWORK_OP_INTERPRETER_BOXING_EAGER_BOXING_INTERPRETER_H_ diff --git a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp index 8ccd7428f93..186dc40d0e4 100644 --- a/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp +++ b/oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.cpp @@ -17,13 +17,15 @@ limitations under the License. 
#include "oneflow/core/common/constant.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/common/container_util.h" -#include "oneflow/core/job/sbp_parallel.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_mgr.h" #include "oneflow/core/framework/op_interpreter/boxing/eager_boxing_interpreter_util.h" #include "oneflow/core/framework/op_interpreter/boxing/collective_boxing_interpreter.h" #include "oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.h" #include "oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.h" #include "oneflow/core/framework/op_interpreter/boxing/naive_s2p_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/cuda_copy_boxing_interpreter.h" +#include "oneflow/core/framework/op_interpreter/boxing/cuda_based_cpu_mpi_boxing_interpreter.h" namespace oneflow { @@ -31,18 +33,6 @@ namespace { using SbpPair2EagerBoxingInterpreter = HashMap, std::shared_ptr>; -std::string GetSupportedBoxingTypeInfo() { - static std::string supported_boxing_type_info = - "============ Supported eager boxing type============\n" - "\'[S(0)] -> [B]\' on GPU\n" - "\'[S(0)] -> [P]\' on GPU\n" - "\'[P] -> [B]\' on GPU\n" - "\'[P] -> [S(0)]\' on GPU\n" - "\'[B] -> [S(0)]\' on GPU\n" - "\'[B] -> [P]\' on GPU or CPU"; - return supported_boxing_type_info; -} - Maybe GetOneDimNcclCollectiveEagerBoxingInterpreter( Symbol in_nd_sbp, Symbol out_nd_sbp) { static SbpPair2EagerBoxingInterpreter sbp_pair2eager_boxing_interpreter = { @@ -58,54 +48,83 @@ Maybe GetOneDimNcclCollectiveEagerBoxingInterpreter( std::make_shared()}, }; const auto& key = std::make_pair(in_nd_sbp->sbp_parallel(0), out_nd_sbp->sbp_parallel(0)); - CHECK_OR_RETURN(sbp_pair2eager_boxing_interpreter.find(key) - != sbp_pair2eager_boxing_interpreter.end()) - << "Eager boxing type \'" << NdSbpToString(in_nd_sbp) << " -> " << NdSbpToString(out_nd_sbp) - << "\'" - << " not support yet\n" - << GetSupportedBoxingTypeInfo(); - return JUST(MapAt(sbp_pair2eager_boxing_interpreter, key)); } +Maybe GetCudaBasedCpuMpiBoxingInterpreter( + Symbol in_nd_sbp, Symbol out_nd_sbp, + Symbol in_parallel_desc, Symbol out_parallel_desc) { + CHECK_OR_RETURN(in_nd_sbp != out_nd_sbp); + const auto& gpu_in_parallel_desc = JUST(ReplaceDeviceType(in_parallel_desc, DeviceType::kGPU)); + const auto& gpu_out_parallel_desc = JUST(ReplaceDeviceType(out_parallel_desc, DeviceType::kGPU)); + CHECK_OR_RETURN(gpu_in_parallel_desc == gpu_out_parallel_desc); + const auto& gpu_boxing_interpreter = + JUST(GetOneDimNcclCollectiveEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp)); + return std::shared_ptr(new CudaBasedCpuMpiBoxingInterpreter()); +} + +Maybe IgnoringDeviceTypeEqual(Symbol lhs, Symbol rhs) { + if (lhs == rhs) { return true; } + return lhs == JUST(ReplaceDeviceType(rhs, lhs->device_type())); +} + Maybe GetBoxingInterpreter(Symbol in_nd_sbp, Symbol out_nd_sbp, Symbol in_parallel_desc, Symbol out_parallel_desc) { if (in_parallel_desc == out_parallel_desc && (in_parallel_desc->parallel_num() == 1 || in_nd_sbp == out_nd_sbp)) { - static std::shared_ptr identity_boxing_interpreter = - std::make_shared(); - return identity_boxing_interpreter; + return std::shared_ptr(new IdentityBoxingInterpreter()); + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && in_parallel_desc == out_parallel_desc + && EagerBoxingInterpreterUtil::IsBoxingB2P(in_nd_sbp->sbp_parallel(0), + 
out_nd_sbp->sbp_parallel(0))) { + return std::shared_ptr(new NaiveB2PBoxingInterpreter()); + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && in_parallel_desc == out_parallel_desc + && in_parallel_desc->device_type() == DeviceType::kGPU) { + const auto& gpu_boxing_interpreter = + TRY(GetOneDimNcclCollectiveEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp)); + if (gpu_boxing_interpreter.IsOk()) { return JUST(gpu_boxing_interpreter); } + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && in_parallel_desc == out_parallel_desc + && in_parallel_desc->device_type() == DeviceType::kCPU) { + const auto& interpreter = TRY(GetCudaBasedCpuMpiBoxingInterpreter( + in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + if (interpreter.IsOk()) { return JUST(interpreter); } + } + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && JUST(IgnoringDeviceTypeEqual(in_parallel_desc, out_parallel_desc)) + && ((in_parallel_desc->device_type() == DeviceType::kGPU + && out_parallel_desc->device_type() == DeviceType::kCPU) + || (in_parallel_desc->device_type() == DeviceType::kCPU + && out_parallel_desc->device_type() == DeviceType::kGPU)) + && in_nd_sbp == out_nd_sbp) { + return std::shared_ptr(new CudaCopyBoxingInterpreter()); } - if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1) { - if (in_parallel_desc == out_parallel_desc) { - if (EagerBoxingInterpreterUtil::IsBoxingB2P(in_nd_sbp->sbp_parallel(0), - out_nd_sbp->sbp_parallel(0))) { - std::shared_ptr naive_bp_boxing_interpreter = - std::make_shared(); - return naive_bp_boxing_interpreter; - } else if (in_parallel_desc->device_type() == DeviceType::kGPU) { - return GetOneDimNcclCollectiveEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp); - } else { - UNIMPLEMENTED_THEN_RETURN() << "Eager boxing type \'" << NdSbpToString(in_nd_sbp) << " -> " - << NdSbpToString(out_nd_sbp) << "\'" - << " not support yet\n" - << GetSupportedBoxingTypeInfo(); - } - } else { - UNIMPLEMENTED_THEN_RETURN() << "Eager boxing with different placement not support yet\n" - << GetSupportedBoxingTypeInfo(); - } - } else { - UNIMPLEMENTED_THEN_RETURN() << "N-dim eager boxing type \'" << NdSbpToString(in_nd_sbp) - << " -> " << NdSbpToString(out_nd_sbp) << "\'" - << " not support yet\n" - << GetSupportedBoxingTypeInfo(); + if (in_nd_sbp->sbp_parallel_size() == 1 && out_nd_sbp->sbp_parallel_size() == 1 + && JUST(IgnoringDeviceTypeEqual(in_parallel_desc, out_parallel_desc)) + && ((in_parallel_desc->device_type() == DeviceType::kGPU + && out_parallel_desc->device_type() == DeviceType::kCPU) + || (in_parallel_desc->device_type() == DeviceType::kCPU + && out_parallel_desc->device_type() == DeviceType::kGPU)) + && in_nd_sbp != out_nd_sbp) { + const auto& interpreter = TRY(GetCudaBasedCpuMpiBoxingInterpreter( + in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); + if (interpreter.IsOk()) { return JUST(interpreter); } } + UNIMPLEMENTED_THEN_RETURN() << Error::BoxingNotSupportedError() + << "consistent-to-consistent not supported" + << ". 
from_nd_sbp: " << *JUST(NdSbpToString(in_nd_sbp)) + << ", to_nd_sbp: " << *JUST(NdSbpToString(out_nd_sbp)) + << ", from_placement: " << *JUST(PlacementToString(in_parallel_desc)) + << ", to_placement: " << *JUST(PlacementToString(out_parallel_desc)); } -auto* CachedGetBoxingInterpreter = DECORATE(&GetBoxingInterpreter, ThreadLocal); +static constexpr auto* CachedGetBoxingInterpreter = DECORATE(&GetBoxingInterpreter, ThreadLocal); } // namespace diff --git a/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp index 06a32dbd0f8..13cca74c95b 100644 --- a/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.cpp @@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "oneflow/core/functional/functional.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/op_interpreter/boxing/identity_boxing_interpreter.h" namespace oneflow { @@ -23,7 +25,10 @@ Maybe IdentityBoxingInterpreter::InterpretImpl( Symbol out_parallel_desc) const { CHECK_OR_RETURN(in_parallel_desc == out_parallel_desc); CHECK_OR_RETURN(in_parallel_desc->parallel_num() == 1 || in_nd_sbp == out_nd_sbp); - return input; + // reset sbp if parallel_num == 1 and reset ConsistentId + std::shared_ptr tensor = JUST(input->cur_rank_phy_tensor()); + return one::functional::ToConsistent(tensor, out_parallel_desc, *JUST(GetSbpList(out_nd_sbp)), + GetNoneSbpList()); } } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp b/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp index e37c7875bab..1034c5eb830 100644 --- a/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include "oneflow/core/framework/op_interpreter/boxing/naive_b2p_boxing_interpreter.h" #include "oneflow/core/framework/device.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/resource_desc.h" #include "oneflow/core/control/global_process_ctx.h" @@ -28,11 +29,14 @@ Maybe NaiveB2PBoxingInterpreter::InterpretImpl( Symbol out_parallel_desc) const { CHECK_OR_RETURN(in_parallel_desc == out_parallel_desc); int64_t root = JUST(in_parallel_desc->MachineId4ParallelId(0)); + std::shared_ptr tensor = JUST(input->cur_rank_phy_tensor()); if (root == GlobalProcessCtx::Rank()) { - return JUST(one::functional::Identity(input)); + // do nothing } else { - return JUST(one::functional::ZerosLike(input)); + tensor = JUST(one::functional::ZerosLike(tensor)); } + return one::functional::ToConsistent(tensor, out_parallel_desc, *JUST(GetSbpList(out_nd_sbp)), + GetNoneSbpList()); } } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp index d904c92cb96..93fb9479817 100644 --- a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp @@ -34,6 +34,7 @@ limitations under the License. 
#include "oneflow/user/kernels/stateful_local_opkernel.h" #include "oneflow/core/framework/tensor_rpc_util.h" #include "oneflow/core/framework/tensor_consistent_id.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/common/decorator.h" namespace oneflow { @@ -65,13 +66,11 @@ std::string GetDynamicOpConsistentFailedDebugString(const UserOpExpr& user_op_ex } Maybe CalcBoxingOutput(const std::shared_ptr& input, Symbol out_nd_sbp, - bool current_rank_local_is_valid) { - if (!current_rank_local_is_valid) { return input; } + Symbol out_parallel_desc) { const auto* mgr = Global::Get(); // Eager boxing const auto& in_nd_sbp = JUST(input->nd_sbp()); const auto& in_parallel_desc = JUST(input->parallel_desc()); - const auto& out_parallel_desc = in_parallel_desc; const auto& boxing_interpreter = JUST( mgr->GetEagerBoxingInterpreter(in_nd_sbp, out_nd_sbp, in_parallel_desc, out_parallel_desc)); const auto& output = JUST(boxing_interpreter->Interpret(input, in_nd_sbp, out_nd_sbp, @@ -112,8 +111,11 @@ Maybe Interpret(const UserOpExpr& user_op_expr, const TensorTuple& inputs, for (int i = 0; i < inputs.size(); ++i) { std::shared_ptr input = inputs.at(i); const auto& infered_input_meta = result->input_tensor_metas().at(i); + const auto& input_parallel_desc = JUST(input->parallel_desc()); + CHECK_OR_RETURN(input_parallel_desc == infered_input_meta->parallel_desc()); if (infered_input_meta->nd_sbp() != JUST(input->nd_sbp())) { - input = JUST(GetBoxingOutput(input, infered_input_meta->nd_sbp(), parallel_id.has_value())); + input = JUST(GetBoxingOutput(input, infered_input_meta->nd_sbp(), + infered_input_meta->parallel_desc())); } const auto& local_tensor = JUST(input->cur_rank_phy_tensor()); input_eager_blob_objects->at(i) = JUST(local_tensor->eager_blob_object()); @@ -151,6 +153,57 @@ Maybe EagerConsistentInterpreter::ApplyImpl(const VariableOpExpr& op_expr, OF_UNIMPLEMENTED(); } +namespace { + +static constexpr auto* RecursiveGetBoxingOutput = + DECORATE(&CalcBoxingOutput, CheckConsistentTensorMeta); + +Maybe RawConsistentToConsistent(const ConsistentToConsistentOpExpr& op_expr, + const TensorTuple& inputs, TensorTuple* outputs, + const OpExprInterpContext& ctx) { + CHECK_EQ_OR_RETURN(inputs.size(), 1); + CHECK_EQ_OR_RETURN(outputs->size(), 1); + const auto& input = inputs.at(0); + CHECK_OR_RETURN(input->is_consistent()); + CHECK_OR_RETURN(ctx.parallel_desc.has_value()); + CHECK_OR_RETURN(ctx.nd_sbp.has_value()); + const auto& in_parallel_desc = JUST(input->parallel_desc()); + const auto& out_nd_sbp = JUST(ctx.nd_sbp.value()); + const auto& out_parallel_desc = JUST(ctx.parallel_desc.value()); + const auto& in_parallel_id = JUST(GetParallelId4CurrentProcessCtx(in_parallel_desc)); + const auto& out_parallel_id = JUST(GetParallelId4CurrentProcessCtx(out_parallel_desc)); + const auto& tensor = JUST(RecursiveGetBoxingOutput(input, out_nd_sbp, out_parallel_desc)); + CHECK_OR_RETURN(tensor); + if (out_parallel_id->has_value()) { + const auto& nd_sbp = JUST(tensor->nd_sbp()); + const auto& parallel_desc = JUST(tensor->parallel_desc()); + CHECK_OR_RETURN(nd_sbp == out_nd_sbp) << ". 
nd_sbp: " << *JUST(NdSbpToString(nd_sbp)) + << ", out_nd_sbp" << *JUST(NdSbpToString(out_nd_sbp)); + CHECK_OR_RETURN(parallel_desc == out_parallel_desc); + outputs->at(0) = tensor; + } else { + ConsistentTensorMeta tensor_meta(tensor->shape(), tensor->dtype()->data_type(), out_nd_sbp, + out_parallel_desc); + const auto& tensor_impl = + JUST(EagerConsistentTensorImpl::New(SymbolOf(tensor_meta), tensor->requires_grad(), false)); + outputs->at(0).reset(new ConsistentTensor(tensor_impl)); + } + CHECK_OR_RETURN(outputs->at(0)); + return Maybe::Ok(); +} + +static constexpr auto* ConsistentToConsistent = + DECORATE(&RawConsistentToConsistent, NonRecursiveInitConsistentId); + +} // namespace + +Maybe EagerConsistentInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_expr, + const TensorTuple& inputs, TensorTuple* outputs, + const OpExprInterpContext& ctx) const { + JUST(ConsistentToConsistent(op_expr, inputs, outputs, ctx)); + return Maybe::Ok(); +} + Maybe EagerConsistentInterpreter::ApplyImpl(const CastToConsistentOpExpr& op_expr, const TensorTuple& inputs, TensorTuple* outputs, const OpExprInterpContext& ctx) const { diff --git a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp index 36ca96960d1..2427d4c5f35 100644 --- a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/decorator.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/op_interpreter.h" #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" @@ -35,6 +36,7 @@ limitations under the License. #include "oneflow/core/autograd/autograd_mode.h" #include "oneflow/core/framework/placement_sbp_util.h" #include "oneflow/core/framework/tensor_rpc_util.h" +#include "oneflow/core/framework/tensor_consistent_id.h" #include "oneflow/core/framework/op_builder.h" #include "oneflow/core/framework/id_util.h" #include "oneflow/core/functional/functional.h" @@ -249,29 +251,36 @@ Maybe FindOrCreatEagerNcclBroadcastOpExpr(Symbol } return iter->second; } +} // namespace -Maybe GetSyncedTensorIfBroadcast(const std::shared_ptr& tensor, - Symbol parallel_desc, - Symbol nd_sbp) { - Optional parallel_id; - JUST(GetDevice4CurrentProcessCtx(parallel_desc, ¶llel_id)); - if (!parallel_id.has_value()) { return tensor; } - const auto& broadcast_parallel_desc = JUST(GetBroadcastSubParallelDesc(parallel_desc, nd_sbp)); - if (broadcast_parallel_desc->parallel_num() == 1 /* no broadcast */) { return tensor; } - std::shared_ptr op_expr = - JUST(FindOrCreatEagerNcclBroadcastOpExpr(broadcast_parallel_desc)); - if (JUST(broadcast_parallel_desc->MachineId4ParallelId(0)) == GlobalProcessCtx::Rank()) { +Maybe Broadcast(const std::shared_ptr& tensor, Symbol parallel_desc) { + CHECK_OR_RETURN(parallel_desc->containing_current_rank()); + if (parallel_desc->parallel_num() == 1 /* no broadcast */) { return tensor; } + std::shared_ptr op_expr = JUST(FindOrCreatEagerNcclBroadcastOpExpr(parallel_desc)); + if (JUST(parallel_desc->MachineId4ParallelId(0)) == GlobalProcessCtx::Rank()) { // inplace. 
TensorTuple outputs{tensor}; JUST(OpInterpUtil::Dispatch(*op_expr, {tensor}, &outputs, - one::OpExprInterpContext(AttrMap{}, broadcast_parallel_desc))); + one::OpExprInterpContext(AttrMap{}, parallel_desc))); return tensor; } else { return JUST(OpInterpUtil::Dispatch( - *op_expr, {tensor}, one::OpExprInterpContext(AttrMap{}, broadcast_parallel_desc))); + *op_expr, {tensor}, one::OpExprInterpContext(AttrMap{}, parallel_desc))); } } +namespace { + +Maybe GetSyncedTensorIfBroadcast(const std::shared_ptr& tensor, + Symbol parallel_desc, + Symbol nd_sbp) { + Optional parallel_id; + JUST(GetDevice4CurrentProcessCtx(parallel_desc, &parallel_id)); + if (!parallel_id.has_value()) { return tensor; } + const auto& broadcast_parallel_desc = JUST(GetBroadcastSubParallelDesc(parallel_desc, nd_sbp)); + return Broadcast(tensor, broadcast_parallel_desc); +} + Maybe CalcPhysicalShape(Symbol consistent_tensor_meta) { const auto& opt_parallel_id = JUST(GetParallelId4CurrentProcessCtx(consistent_tensor_meta->parallel_desc())); @@ -294,9 +303,16 @@ Maybe TryReshapeTensor(const std::shared_ptr& tensor, } // namespace -Maybe EagerMirroredInterpreter::ApplyImpl(const CastToConsistentOpExpr& op_expr, +Maybe EagerMirroredInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_expr, const TensorTuple& inputs, TensorTuple* outputs, const OpExprInterpContext& ctx) const { + OF_UNIMPLEMENTED(); +} + +namespace { + +Maybe RawLocalToConsistent(const CastToConsistentOpExpr& op_expr, const TensorTuple& inputs, + TensorTuple* outputs, const OpExprInterpContext& ctx) { std::shared_ptr input_mirrored_tensor; { CHECK_EQ_OR_RETURN(inputs.size(), 1); @@ -323,23 +339,46 @@ Maybe EagerMirroredInterpreter::ApplyImpl(const CastToConsistentOpExpr& op const auto& consistent_tensor_impl = JUST(EagerConsistentTensorImpl::New( SymbolOf(tensor_meta), device, parallel_id, input_mirrored_tensor->requires_grad(), !input_mirrored_tensor->requires_grad())); - const auto& transport_token = JUST(TransportToken::NewMetaTransportToken()); - JUST(consistent_tensor_impl->set_transport_token(transport_token)); consistent_tensor = std::make_shared(consistent_tensor_impl); - JUST(WithConsistencyChecked(consistent_tensor, [&]() -> Maybe { - if (!parallel_id.has_value()) { return Maybe::Ok(); } - const auto& reshaped_tensor = JUST(TryReshapeTensor(input_mirrored_tensor, tensor_meta)); - const auto& synced_tensor = - JUST(GetSyncedTensorIfBroadcast(reshaped_tensor, parallel_desc, nd_sbp)); + if (parallel_id.has_value()) { CHECK_EQ_OR_RETURN(dtype, input_mirrored_tensor->dtype()->data_type()); - consistent_tensor_impl->reset_cur_rank_phy_tensor(JUST(synced_tensor->AsMirroredTensor())); - return Maybe::Ok(); - })); + consistent_tensor_impl->reset_cur_rank_phy_tensor(input_mirrored_tensor); + } } outputs->at(0) = consistent_tensor; return Maybe::Ok(); } +static constexpr auto* LocalToConsistent = + DECORATE(&RawLocalToConsistent, NonRecursiveInitConsistentId); + +} // namespace + +Maybe EagerMirroredInterpreter::ApplyImpl(const CastToConsistentOpExpr& op_expr, + const TensorTuple& inputs, TensorTuple* outputs, + const OpExprInterpContext& ctx) const { + JUST(LocalToConsistent(op_expr, inputs, outputs, ctx)); + const auto& consistent_tensor = JUST(outputs->at(0)->AsConsistentTensor()); + JUST(WithConsistencyChecked(consistent_tensor, [&]() -> Maybe { + if (IsConsistentTensorMetaCheckDisabled()) { return Maybe::Ok(); } + const auto& parallel_desc = JUST(ctx.parallel_desc.value()); + const auto& parallel_id = 
JUST(GetParallelId4CurrentProcessCtx(parallel_desc)); + if (!parallel_id->has_value()) { return Maybe::Ok(); } + const auto& nd_sbp = JUST(ctx.nd_sbp.value()); + const auto& tensor_meta = JUST(consistent_tensor->consistent_tensor_meta()); + const auto& local_tensor = JUST(consistent_tensor->cur_rank_phy_tensor()); + const auto& reshaped_tensor = JUST(TryReshapeTensor(local_tensor, tensor_meta)); + const auto& synced_tensor = + JUST(GetSyncedTensorIfBroadcast(reshaped_tensor, parallel_desc, nd_sbp)); + auto* consistent_tensor_impl = + reinterpret_cast(consistent_tensor->mut_impl()); + CHECK_NOTNULL_OR_RETURN(consistent_tensor_impl); + consistent_tensor_impl->reset_cur_rank_phy_tensor(JUST(synced_tensor->AsMirroredTensor())); + return Maybe::Ok(); + })); + return Maybe::Ok(); +} + Maybe EagerMirroredInterpreter::ApplyImpl(const CastFromConsistentOpExpr& op_expr, const TensorTuple& inputs, TensorTuple* outputs, const OpExprInterpContext& ctx) const { diff --git a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h index 894746df549..340e54fed9a 100644 --- a/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h +++ b/oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h @@ -19,8 +19,14 @@ namespace oneflow { class Device; class TensorTuple; +class ParallelDesc; namespace one { + +class Tensor; + Maybe RunEmptyOp(TensorTuple* outputs); -} +Maybe Broadcast(const std::shared_ptr& tensor, Symbol parallel_desc); + +} // namespace one } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index b6ee5267736..6f9597c483b 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -582,9 +582,6 @@ Maybe LazyInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_ex CHECK_OR_RETURN(input_tensor->is_lazy()); CHECK_OR_RETURN(input_tensor->is_consistent()); - bool identity_grad = JUST(ctx.attrs.GetAttr("identity_grad")); - const auto& grad_sbp_list = JUST(ctx.attrs.GetAttr>("grad_sbp")); - CHECK_OR_RETURN(ctx.parallel_desc.has_value()); const auto& parallel_desc_sym = JUST(ctx.parallel_desc.value()); CHECK_OR_RETURN(ctx.nd_sbp.has_value()); @@ -611,24 +608,22 @@ Maybe LazyInterpreter::ApplyImpl(const ConsistentToConsistentOpExpr& op_ex } // build parallel cast op expr + std::shared_ptr> sbp_list_ptr = JUST(GetNdSbpStrList(sbp_sym)); std::string grad_mode; - std::vector grad_parallel_distribution; - if (identity_grad) { - grad_mode = "identity"; - } else if (grad_sbp_list.size() > 0) { + std::vector grad_sbp_str_list; + if (op_expr.grad_nd_sbp().has_value()) { grad_mode = "manual"; - grad_parallel_distribution = grad_sbp_list; + grad_sbp_str_list = *JUST(GetNdSbpStrList(JUST(op_expr.grad_nd_sbp().value()))); } else { - grad_mode = "restore"; + grad_mode = "identity"; } - auto sbp_list_ptr = JUST(GetNdSbpStrList(sbp_sym)); std::shared_ptr parallel_cast_op_expr = JUST(OpBuilder("hierarchical_parallel_cast", "trivial_op_name") .Input("in") .Output("out") .Attr>("nd_sbp", *sbp_list_ptr) .Attr("grad_mode", grad_mode) - .Attr>("grad_nd_sbp", grad_parallel_distribution) + .Attr>("grad_nd_sbp", grad_sbp_str_list) .Build()); CHECK_EQ_OR_RETURN(op_expr.output_size(), 1); diff --git a/oneflow/core/framework/op_interpreter/op_interpreter.cpp b/oneflow/core/framework/op_interpreter/op_interpreter.cpp 
index 7481821d58f..b1eb8dbf4cb 100644 --- a/oneflow/core/framework/op_interpreter/op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/op_interpreter.cpp @@ -57,6 +57,7 @@ Maybe EagerInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& in APPLY_IF(VariableOp); APPLY_IF(CastToMirroredOp); APPLY_IF(CastFromMirroredOp); + APPLY_IF(ConsistentToConsistentOp); APPLY_IF(CastToConsistentOp); APPLY_IF(CastFromConsistentOp); APPLY_IF(DistributeSplitOp); diff --git a/oneflow/core/framework/tensor_consistent_id.cpp b/oneflow/core/framework/tensor_consistent_id.cpp index 74192f153c7..f76433f1859 100644 --- a/oneflow/core/framework/tensor_consistent_id.cpp +++ b/oneflow/core/framework/tensor_consistent_id.cpp @@ -28,10 +28,9 @@ int64_t* MutThreadLocalRecursiveDepth() { } Maybe InitConsistentId(TensorTuple* outputs) { - for (int i = 0; i < outputs->size(); ++i) { - const auto& consistent_tensor = std::dynamic_pointer_cast(outputs->at(i)); - CHECK_OR_RETURN(consistent_tensor) - << Error::Unimplemented() << "consistent tensors suppported only."; + for (const auto& output : *outputs) { + CHECK_OR_RETURN(output); + const auto& consistent_tensor = JUST(output->AsConsistentTensor()); const auto& transport_token = JUST(TransportToken::NewMetaTransportToken()); JUST(consistent_tensor->mut_impl()->set_transport_token(transport_token)); } diff --git a/oneflow/core/framework/tensor_consistent_id.h b/oneflow/core/framework/tensor_consistent_id.h index fc35a548898..19c8f0fd53b 100644 --- a/oneflow/core/framework/tensor_consistent_id.h +++ b/oneflow/core/framework/tensor_consistent_id.h @@ -30,16 +30,15 @@ Maybe InitConsistentId(TensorTuple* outputs); template struct NonRecursiveInitConsistentId; -template -struct NonRecursiveInitConsistentId { - static_assert(is_maybe::value, ""); - template - static RetT Call(Arg0 arg0, Arg1 arg1, TensorTuple* outputs, Args... args) { +template +struct NonRecursiveInitConsistentId, Arg0, Arg1, TensorTuple*, Args...> { + template (*func)(Arg0, Arg1, TensorTuple*, Args...)> + static Maybe Call(Arg0 arg0, Arg1 arg1, TensorTuple* outputs, Args... 
args) { auto* recursive_depth = MutThreadLocalRecursiveDepth(); ++*recursive_depth; - RetT ret = func(arg0, arg1, outputs, args...); + Maybe ret = func(arg0, arg1, outputs, args...); --*recursive_depth; - if (*recursive_depth == 0) { JUST(InitConsistentId(outputs)); } + if (*recursive_depth == 0 && ret.IsOk()) { JUST(InitConsistentId(outputs)); } return ret; } }; diff --git a/oneflow/core/framework/tensor_rpc_util.h b/oneflow/core/framework/tensor_rpc_util.h index f1e37246b3e..04cb9d1c5f1 100644 --- a/oneflow/core/framework/tensor_rpc_util.h +++ b/oneflow/core/framework/tensor_rpc_util.h @@ -40,6 +40,10 @@ Maybe RunCallback(const std::shared_ptr& tensor, } // namespace private_details +inline bool IsConsistentTensorMetaCheckDisabled() { + return *private_details::MutThreadLocalDepth() > 1; +} + template struct CheckConsistentTensorMeta; @@ -60,6 +64,11 @@ struct CheckConsistentTensorMeta&, Args } }; +struct DisableCheckConsistentTensorMetaScope final { + DisableCheckConsistentTensorMetaScope() { ++*private_details::MutThreadLocalDepth(); } + ~DisableCheckConsistentTensorMetaScope() { --*private_details::MutThreadLocalDepth(); } +}; + static constexpr auto* WithConsistencyChecked = DECORATE(&private_details::RunCallback, CheckConsistentTensorMeta); diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 0eebf46d75e..35b00c6d708 100644 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -959,7 +959,7 @@ bind_python: True - name: "to_consistent" - signature: "Tensor ToConsistent(Tensor x, *, Placement placement, SbpList sbp, Bool identity_grad=False, SbpList grad_sbp)" + signature: "Tensor ToConsistent(Tensor x, *, Placement placement, SbpList sbp, SbpList grad_sbp)" bind_python: True - name: "to_local" @@ -970,6 +970,10 @@ signature: "Tensor AllReduce(Tensor x)" bind_python: True +- name: "broadcast" + signature: "Tensor Broadcast(Tensor x)" + bind_python: True + - name: "select_first" signature: "Tensor SelectFirst(TensorTuple inputs)" bind_python: True @@ -1037,6 +1041,18 @@ SbpList sbp_tuple, DataType dtype=None, Generator generator=None)" bind_python: True +- name: "randint" + signature: "Tensor RandInt(Int64 low, Int64 high, *, Shape shape, DataType dtype=None, Device device=None, Generator generator=None)" + bind_python: True + +- name: "consistent_randint" + signature: "Tensor ConsistentRandInt(Int64 low, Int64 high, *, Shape shape, Placement placement, SbpList sbp_tuple, DataType dtype=None, Generator generator=None)" + bind_python: True + +- name: "randperm" + signature: "Tensor RandPerm(Int32 n, *, DataType dtype=kInt64, Device device=None, Generator generator=None)" + bind_python: True + - name: "scalar_fmod" signature: "Tensor ScalarFMod(Tensor in, Scalar scalar)" bind_python: False @@ -1057,17 +1073,6 @@ signature: "Tensor L2NormalizeGrad(Tensor dy, Tensor y, Tensor square_x_sum, Int32 axis, Float epsilon, *)" bind_python: False -- name: "randint" - signature: "Tensor Randint(Int64 low,Int64 high,Shape shape, *,Device device=None,Generator generator=None)" - bind_python: True - -- name: "consistent_randint" - signature: "Tensor ConsistentRandint(Int64 low,Int64 high,Shape shape, *,Placement placement, SbpList sbp_tuple, Generator generator=None)" - -- name: "randperm" - signature: "Tensor Randperm(Int32 n,*,Device device=None, Generator generator=None)" - bind_python: True - - name: "consistent_randperm" signature: "Tensor ConsistentRandperm(Int32 n,*, Placement 
placement, SbpList sbp_tuple, Generator generator=None)" bind_python: True diff --git a/oneflow/core/functional/impl/comm_functor.cpp b/oneflow/core/functional/impl/comm_functor.cpp index 46572ccd7ab..7abd3041b58 100644 --- a/oneflow/core/functional/impl/comm_functor.cpp +++ b/oneflow/core/functional/impl/comm_functor.cpp @@ -18,6 +18,7 @@ limitations under the License. #include "oneflow/core/framework/op_builder.h" #include "oneflow/core/framework/op_expr.h" #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" +#include "oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h" #include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_tuple.h" #include "oneflow/core/functional/functional.h" @@ -33,6 +34,19 @@ namespace one { namespace functional { namespace impl { +class BroadcastFunctor { + public: + BroadcastFunctor() = default; + Maybe operator()(const std::shared_ptr& x) const { + const auto& rank_group = JUST(RankGroupScope::CurrentRankGroup()); + std::string device_type_str = JUST(x->device())->type(); + CHECK_OR_RETURN(device_type_str == "cuda" || device_type_str == "cpu"); + DeviceType device_type = device_type_str == "cuda" ? DeviceType::kGPU : DeviceType::kCPU; + const auto& parallel_desc = JUST(RankGroup::GetDefaultParallelDesc(device_type, rank_group)); + return one::Broadcast(x, parallel_desc); + } +}; + class AllReduceFunctor { public: AllReduceFunctor() = default; @@ -75,7 +89,10 @@ class AllReduceFunctor { }; } // namespace impl -ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("AllReduce"); }; +ONEFLOW_FUNCTION_LIBRARY(m) { + m.add_functor("AllReduce"); + m.add_functor("Broadcast"); +}; } // namespace functional } // namespace one diff --git a/oneflow/core/functional/impl/consistent_cast.cpp b/oneflow/core/functional/impl/consistent_cast.cpp index 30a0e312f98..6bd3392b4cb 100644 --- a/oneflow/core/functional/impl/consistent_cast.cpp +++ b/oneflow/core/functional/impl/consistent_cast.cpp @@ -30,6 +30,7 @@ limitations under the License. #include "oneflow/core/job/global_for.h" #include "oneflow/core/job/resource_desc.h" #include "oneflow/core/job/rank_group_scope.h" +#include "oneflow/core/job/lazy_mode.h" #include "oneflow/core/framework/transport_token.h" #include "oneflow/core/framework/transport_util.h" #include "oneflow/core/framework/placement_sbp_util.h" @@ -38,6 +39,7 @@ limitations under the License. 
#include "oneflow/core/common/container_util.h" #include "oneflow/core/common/balanced_splitter.h" #include "oneflow/core/common/decorator.h" +#include "oneflow/core/common/optional.h" #include "oneflow/core/ccl/ccl.h" namespace oneflow { @@ -201,49 +203,37 @@ Maybe GetLogicalShapeAndDataType(Shape* logical_shape, DataType* /* in and return Maybe::Ok(); } -Maybe MakeNdSbpOpExpr(const std::vector>& sbp_parallels) { - return OpBuilder("hierarchical_parallel_cast", *JUST(UniqueStr("hierarchical_parallel_cast"))) - .Input("in") - .Output("out") - .Attr>("nd_sbp", *JUST(GetNdSbpStrList(sbp_parallels))) - .Attr("grad_mode", "restore") - .Attr>("grad_nd_sbp", std::vector()) - .Build(); -} - -auto* CachedNdSbpOpExpr = DECORATE(&MakeNdSbpOpExpr, ThreadLocalCopiable); - -Maybe ConsistentToConsistent(const std::shared_ptr& x, - Symbol parallel_desc, - const std::vector>& sbp_parallels) { - const auto& consistent_tensor = JUST(x->AsConsistentTensor()); - CHECK_NOTNULL_OR_RETURN(consistent_tensor) << "consistent tensors supported only"; - CHECK_OR_RETURN(consistent_tensor->is_eager()) << "eager tensors supported only"; - const auto& nd_sbp_cast_op_expr = JUST(CachedNdSbpOpExpr(sbp_parallels)); +namespace { - const auto& ret = - JUST(OpInterpUtil::Dispatch(*nd_sbp_cast_op_expr, {consistent_tensor})); - return ret; +Maybe RawGetConsistentToConsistentOpExpr( + const std::vector>& grad_sbp_parallels) { + Optional> grad_nd_sbp; + if (!grad_sbp_parallels.empty()) { grad_nd_sbp = JUST(GetNdSbp(grad_sbp_parallels)); } + std::shared_ptr op_expr = JUST(one::ConsistentToConsistentOpExpr::New(grad_nd_sbp)); + return op_expr; } -Maybe LazyConsistentToConsistent( - const std::shared_ptr& x, Symbol parallel_desc, - const std::vector>& sbp_parallels, bool identity_grad, - const std::vector>& grad_sbp_parallels, - const std::shared_ptr& op) { - CHECK_OR_RETURN(x->is_lazy()); - CHECK_OR_RETURN(x->is_consistent()); - - Symbol parallel_distribution = JUST(GetNdSbp(sbp_parallels)); - std::vector grad_parallel_distribution = *JUST(GetNdSbpStrList(grad_sbp_parallels)); +} // namespace - MutableAttrMap attrs; - JUST(attrs.SetAttr("identity_grad", identity_grad)); - JUST(attrs.SetAttr>("grad_sbp", grad_parallel_distribution)); +static constexpr auto* GetConsistentToConsistentOpExpr = + DECORATE(&RawGetConsistentToConsistentOpExpr, ThreadLocalCopiable); - const auto& output = JUST(OpInterpUtil::Dispatch( - *op, {x}, OpExprInterpContext(attrs, parallel_desc, parallel_distribution))); - return output; +Maybe ConsistentToConsistent( + const std::shared_ptr& x, Symbol parallel_desc, + const std::vector>& sbp_parallels, + const std::vector>& grad_sbp_parallels) { + const auto& consistent_tensor = JUST(x->AsConsistentTensor()); + CHECK_NOTNULL_OR_RETURN(consistent_tensor) << "consistent tensors supported only"; + const auto& op = JUST(GetConsistentToConsistentOpExpr(grad_sbp_parallels)); + const auto& nd_sbp = JUST(GetNdSbp(sbp_parallels)); + const auto& tensor = JUST(OpInterpUtil::Dispatch( + *op, {consistent_tensor}, OpExprInterpContext(AttrMap{}, parallel_desc, nd_sbp))); + if (!LazyMode::is_enabled() && tensor != x) { + const auto& input_consistent_id = JUST(x->transport_token()); + const auto& output_consistend_id = JUST(tensor->transport_token()); + CHECK_NE_OR_RETURN(input_consistent_id, output_consistend_id); + } + return tensor; } Maybe LocalToConsistent(const std::shared_ptr& x, @@ -293,30 +283,23 @@ class ToConsistentFunctor { ToConsistentFunctor() { local_to_consistent_op_ = 
CHECK_JUST(one::CastToConsistentOpExpr::New(*CHECK_JUST(UniqueStr("cast_to_consistent")))); - consistent_to_consistent_op_ = CHECK_JUST( - one::ConsistentToConsistentOpExpr::New(*CHECK_JUST(UniqueStr("consistent_to_consistent")))); } Maybe operator()(const std::shared_ptr& x, Symbol parallel_desc, const std::vector>& sbp_parallels, - bool identity_grad, const std::vector>& grad_sbp_parallels) const { + std::shared_ptr tensor; if (x->is_consistent()) { - if (x->is_lazy()) { - return JUST(LazyConsistentToConsistent(x, parallel_desc, sbp_parallels, identity_grad, - grad_sbp_parallels, consistent_to_consistent_op_)); - } else { - return JUST(ConsistentToConsistent(x, parallel_desc, sbp_parallels)); - } + tensor = JUST(ConsistentToConsistent(x, parallel_desc, sbp_parallels, grad_sbp_parallels)); } else { - return JUST(LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)); + tensor = JUST(LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)); } + return tensor; } private: std::shared_ptr local_to_consistent_op_; - std::shared_ptr consistent_to_consistent_op_; }; class ConsistentToLocalFunctor { diff --git a/oneflow/core/functional/impl/random_functor.cpp b/oneflow/core/functional/impl/random_functor.cpp index 612467a9d3b..52d06dbbf70 100644 --- a/oneflow/core/functional/impl/random_functor.cpp +++ b/oneflow/core/functional/impl/random_functor.cpp @@ -252,17 +252,28 @@ class ConsistentRandNFunctor { private: std::shared_ptr op_; }; -class RandintFunctor { +class RandIntFunctor { public: - RandintFunctor() { randint_op_ = CHECK_JUST(one::OpBuilder("randint").Output("out").Build()); } + RandIntFunctor() { op_ = CHECK_JUST(one::OpBuilder("uniform").Output("out").Build()); } Maybe operator()(const int64_t low, const int64_t high, const Shape& shape, + const Optional>& dtype, const Optional>& device, const Optional& generator) const { + DataType dtype_val = DataType::kInt64; + if (dtype.has_value()) { + dtype_val = JUST(dtype.value())->data_type(); + + if (dtype_val != DataType::kFloat && dtype_val != DataType::kDouble) { + OF_UNIMPLEMENTED() << dtype_val << " not supported in randint"; + } + } + MutableAttrMap attrs; JUST(attrs.SetAttr("shape", shape)); - JUST(attrs.SetAttr("low", low)); - JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("low", low)); + JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("dtype", dtype_val)); std::shared_ptr gen; if (!generator) { @@ -270,40 +281,55 @@ class RandintFunctor { } else { gen = JUST(generator.value()); } - const auto& randint_kernel_state = std::make_shared(gen); + JUST(attrs.SetAttr("seed", gen->current_seed())); + + const auto& uniform_kernel_state = std::make_shared(gen); if (device.has_value()) { Symbol device_symbol = JUST(device.value()); return OpInterpUtil::Dispatch( - *randint_op_, {}, OpExprInterpContext(attrs, device_symbol, randint_kernel_state)); + *op_, {}, OpExprInterpContext(attrs, device_symbol, uniform_kernel_state)); } else { - return OpInterpUtil::Dispatch(*randint_op_, {}, - OpExprInterpContext(attrs, randint_kernel_state)); + return OpInterpUtil::Dispatch(*op_, {}, + OpExprInterpContext(attrs, uniform_kernel_state)); } } private: - std::shared_ptr randint_op_; + std::shared_ptr op_; }; -class ConsistentRandintFunctor { + +class ConsistentRandIntFunctor { public: - ConsistentRandintFunctor() { - randint_op_ = CHECK_JUST(one::OpBuilder("randint").Output("out").Build()); - } + ConsistentRandIntFunctor() { op_ = CHECK_JUST(one::OpBuilder("uniform").Output("out").Build()); } Maybe 
operator()(const int64_t low, const int64_t high, const Shape& shape, const Symbol& placement, const std::vector>& sbp_tuple, + const Optional>& dtype, const Optional& generator) const { + DataType dtype_val = DataType::kInt64; + if (dtype.has_value()) { + dtype_val = JUST(dtype.value())->data_type(); + + if (dtype_val != DataType::kFloat && dtype_val != DataType::kDouble) { + OF_UNIMPLEMENTED() << dtype_val << " not supported in randint"; + } + } + MutableAttrMap attrs; JUST(attrs.SetAttr("shape", shape)); - JUST(attrs.SetAttr("low", low)); - JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("low", low)); + JUST(attrs.SetAttr("high", high)); + JUST(attrs.SetAttr("dtype", dtype_val)); std::shared_ptr gen; if (!generator) { gen = JUST(one::DefaultAutoGenerator()); } else { gen = JUST(generator.value()); } - const auto& randint_kernel_state = std::make_shared(gen); + + JUST(attrs.SetAttr("seed", gen->current_seed())); + + const auto& uniform_kernel_state = std::make_shared(gen); if (LazyMode::is_enabled()) { std::vector nd_sbp(sbp_tuple.size()); @@ -317,11 +343,11 @@ class ConsistentRandintFunctor { const auto& nd_sbp = JUST(GetNdSbp(sbp_tuple)); return OpInterpUtil::Dispatch( - *randint_op_, {}, OpExprInterpContext(attrs, placement, nd_sbp, randint_kernel_state)); + *op_, {}, OpExprInterpContext(attrs, placement, nd_sbp, uniform_kernel_state)); } private: - std::shared_ptr randint_op_; + std::shared_ptr op_; }; class RandPermFunctor { @@ -398,14 +424,14 @@ class ConsistentRandPermFunctor { ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Bernoulli"); - m.add_functor("Randperm"); - m.add_functor("ConsistentRandperm"); + m.add_functor("RandPerm"); + m.add_functor("ConsistentRandPerm"); m.add_functor("Rand"); m.add_functor("ConsistentRand"); m.add_functor("RandN"); m.add_functor("ConsistentRandN"); - m.add_functor("Randint"); - m.add_functor("ConsistentRandint"); + m.add_functor("RandInt"); + m.add_functor("ConsistentRandInt"); }; } // namespace functional diff --git a/oneflow/core/functional/tensor_index.cpp b/oneflow/core/functional/tensor_index.cpp index a7d982b2a3b..7d10d27bd63 100644 --- a/oneflow/core/functional/tensor_index.cpp +++ b/oneflow/core/functional/tensor_index.cpp @@ -18,6 +18,7 @@ limitations under the License. 
#include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/job/sbp_parallel.h" @@ -244,8 +245,8 @@ Maybe ApplyAdvancedIndexing(const std::shared_ptr& input, if (transposed_input->is_consistent()) { const auto& placement = JUST(transposed_input->parallel_desc()); const auto& broadcast_sbp = JUST(MakeBroadcastSbpParallel()); - packed_indices = JUST(ToConsistent(packed_indices, placement, {broadcast_sbp}, - /*identity_grad=*/false, /*grad_sbp_parallels=*/{})); + packed_indices = + JUST(ToConsistent(packed_indices, placement, {broadcast_sbp}, GetNoneSbpList())); } Symbol device = JUST(transposed_input->device()); if (JUST(packed_indices->device()) != device) { diff --git a/oneflow/core/job/parallel_desc.cpp b/oneflow/core/job/parallel_desc.cpp index 81e329c40be..3a230eabd0c 100644 --- a/oneflow/core/job/parallel_desc.cpp +++ b/oneflow/core/job/parallel_desc.cpp @@ -155,7 +155,8 @@ Maybe> ParallelDesc::GetDevice4CurrentProcessCtx( int64_t machine_id = 0; int64_t device_id = 0; GlobalProcessCtx::GetCurrentMachineIdAndDeviceId(&machine_id, &device_id); - const auto& device = JUST(Device::ThreadLocalGetOrNew(device_tag(), device_id)); + const auto& device = + JUST(Device::ThreadLocalGetOrNew(Device::Type4DeviceTag(device_tag()), device_id)); int64_t parallel_id_val = -1; if (TryGetParallelId(machine_id, device_id, ¶llel_id_val)) { *parallel_id = parallel_id_val; @@ -397,4 +398,58 @@ bool IsMirroredParallelContext(const ParallelContext& parallel_ctx) { return false; } +namespace private_details { + +Maybe> RawReplaceDeviceType(Symbol parallel_desc, + DeviceType device_type) { + ParallelConf parallel_conf(parallel_desc->parallel_conf()); + parallel_conf.set_device_tag(*JUST(DeviceTag4DeviceType(device_type))); + return SymbolOf(ParallelDesc(parallel_conf)); +} + +Maybe RawPlacementToString(Symbol placement) { + std::string device_type = placement->device_tag() == "gpu" ? 
"\"cuda\"" : "\"cpu\""; + std::vector sorted_node_ids; + HashMap> node_id2sorted_dev_phy_ids; + for (int64_t machine_id : placement->sorted_machine_ids()) { + int64_t node_id = GlobalProcessCtx::NodeId(machine_id); + if (!std::count(sorted_node_ids.begin(), sorted_node_ids.end(), node_id)) { + sorted_node_ids.push_back(node_id); + } + for (int64_t device_id : placement->sorted_dev_phy_ids(machine_id)) { + node_id2sorted_dev_phy_ids[node_id].push_back(device_id); + } + } + std::string machine_device_ids = "{"; + int64_t node_idx = 0; + for (int64_t node_id : sorted_node_ids) { + std::string device_name = std::to_string(node_id) + " : ["; + int64_t device_idx = 0; + for (int64_t device_id : node_id2sorted_dev_phy_ids.at(node_id)) { + device_name += std::to_string(device_id); + if (++device_idx != node_id2sorted_dev_phy_ids.at(node_id).size()) { device_name += ", "; } + } + device_name += "]"; + if (++node_idx != sorted_node_ids.size()) { device_name += ", "; } + machine_device_ids += device_name; + } + machine_device_ids += "}"; + std::string hierarchy = "("; + int32_t hierarchy_dim_idx = 0; + for (int64_t dim : placement->hierarchy()->dim_vec()) { + hierarchy += std::to_string(dim); + if (++hierarchy_dim_idx != placement->hierarchy()->dim_vec().size()) { + hierarchy += ", "; + } else if (placement->hierarchy()->dim_vec().size() == 1) { + hierarchy += ","; + } + } + hierarchy += ")"; + std::string placement_str = "oneflow.placement(device_type=" + device_type + + ", machine_device_ids=" + machine_device_ids + + ", hierarchy=" + hierarchy + ")"; + return placement_str; +} + +} // namespace private_details } // namespace oneflow diff --git a/oneflow/core/job/parallel_desc.h b/oneflow/core/job/parallel_desc.h index 8575bd3daa7..5756e17f55c 100644 --- a/oneflow/core/job/parallel_desc.h +++ b/oneflow/core/job/parallel_desc.h @@ -169,6 +169,19 @@ ParallelConf GenParallelConfOfCpuZeroOnAllMachines(); bool IsMirroredParallelContext(const ParallelContext& parallel_ctx); +namespace private_details { + +Maybe> RawReplaceDeviceType(Symbol, DeviceType); +Maybe RawPlacementToString(Symbol placement); + +} // namespace private_details + +static constexpr auto* ReplaceDeviceType = + DECORATE(&private_details::RawReplaceDeviceType, ThreadLocal); + +static constexpr auto* PlacementToString = + DECORATE(&private_details::RawPlacementToString, ThreadLocal); + } // namespace oneflow namespace std { diff --git a/oneflow/core/job/sbp_parallel.cpp b/oneflow/core/job/sbp_parallel.cpp index 0ab616e9cb9..97f36037320 100644 --- a/oneflow/core/job/sbp_parallel.cpp +++ b/oneflow/core/job/sbp_parallel.cpp @@ -192,24 +192,6 @@ std::string SbpParallelToString(const cfg::SbpParallel& sbp_parallel) { return sbp_str; } -std::string NdSbpToString(const Symbol nd_sbp) { - static HashMap, std::string>* nd_sbp2str = - new HashMap, std::string>(); - auto iter = nd_sbp2str->find(nd_sbp); - if (iter == nd_sbp2str->end()) { - std::stringstream nd_sbp_str; - nd_sbp_str << "["; - int32_t idx = 0; - for (const auto& sbp_parallel : nd_sbp->sbp_parallel()) { - nd_sbp_str << SbpParallelToString(sbp_parallel); - if (++idx != nd_sbp->sbp_parallel_size()) { nd_sbp_str << ", "; } - } - nd_sbp_str << "]"; - iter = nd_sbp2str->emplace(nd_sbp, nd_sbp_str.str()).first; - } - return iter->second; -} - void SbpSignatureToNdSbpSignature(const cfg::SbpSignature& sbp_signature, cfg::NdSbpSignature* nd_sbp_signature) { for (const auto& pair : sbp_signature.bn_in_op2sbp_parallel()) { diff --git a/oneflow/core/job/sbp_parallel.h 
b/oneflow/core/job/sbp_parallel.h index 7080db25fe2..ac8a909cc3c 100644 --- a/oneflow/core/job/sbp_parallel.h +++ b/oneflow/core/job/sbp_parallel.h @@ -53,7 +53,6 @@ void SortSbpSignatureListByCopyCost( bool IsValidSbpParallelString(const std::string& sbp_str); bool ParseSbpParallelFromString(const std::string& sbp_str, cfg::SbpParallel* sbp_parallel); std::string SbpParallelToString(const cfg::SbpParallel& sbp_parallel); -std::string NdSbpToString(const Symbol nd_sbp); void SbpSignatureToNdSbpSignature(const cfg::SbpSignature& sbp_signature, cfg::NdSbpSignature* nd_sbp_signature); diff --git a/oneflow/core/job_rewriter/quantization_aware_training.cpp b/oneflow/core/job_rewriter/quantization_aware_training.cpp index 97ffaa44729..be5746146f3 100644 --- a/oneflow/core/job_rewriter/quantization_aware_training.cpp +++ b/oneflow/core/job_rewriter/quantization_aware_training.cpp @@ -216,13 +216,13 @@ Maybe InsertQuantOpAfterInt8Ops4QatConfig(const QatConfig& qat_config) { user_op::UserOpConfWrapper MultiplyOp(const std::string& name, const std::string& x, const std::string& y, const int64_t scope_symbol_id, OpConfMap* inserted_ops) { - const auto op_wrapper = user_op::UserOpConfWrapperBuilder(name) - .Op("broadcast_mul") - .Input("x", x) - .Input("y", y) - .Output("z") - .ScopeSymbolId(scope_symbol_id) - .Build(); + auto op_wrapper = user_op::UserOpConfWrapperBuilder(name) + .Op("broadcast_mul") + .Input("x", x) + .Input("y", y) + .Output("z") + .ScopeSymbolId(scope_symbol_id) + .Build(); (*inserted_ops)[name] = op_wrapper.op_conf(); return op_wrapper; } diff --git a/oneflow/core/kernel/user_kernel.cpp b/oneflow/core/kernel/user_kernel.cpp index ed0803ec170..71a4d4833e7 100644 --- a/oneflow/core/kernel/user_kernel.cpp +++ b/oneflow/core/kernel/user_kernel.cpp @@ -524,7 +524,7 @@ class UserKernelComputeContext final : public user_op::KernelComputeContext { const JobDesc& job_desc) : user_op_conf_(kernel_conf.op_attribute().op_conf()), device_ctx_(device_ctx), - base_ctx_(std::move(UserKernelBaseContext(kernel_conf, job_desc))) { + base_ctx_(kernel_conf, job_desc) { auto InitInOrOut = [&](const PbMap& arg_map) { for (const auto& it : arg_map) { const std::string& arg_name = it.first; diff --git a/oneflow/core/platform/include/pthread_fork.h b/oneflow/core/platform/include/pthread_fork.h new file mode 100644 index 00000000000..bdc91ce8de9 --- /dev/null +++ b/oneflow/core/platform/include/pthread_fork.h @@ -0,0 +1,29 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_PLATFORM_INCLUDE_PTHREAD_FORK_H_ +#define ONEFLOW_CORE_PLATFORM_INCLUDE_PTHREAD_FORK_H_ + +namespace oneflow { + +namespace pthread_fork { + +bool IsForkedSubProcess(); + +} // namespace pthread_fork + +} // namespace oneflow + +#endif // ONEFLOW_CORE_PLATFORM_INCLUDE_PTHREAD_FORK_H_ diff --git a/oneflow/core/platform/lib/pthread_fork.cpp b/oneflow/core/platform/lib/pthread_fork.cpp new file mode 100644 index 00000000000..732383cbb8b --- /dev/null +++ b/oneflow/core/platform/lib/pthread_fork.cpp @@ -0,0 +1,33 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/platform/include/pthread_fork.h" +#include "oneflow/core/common/util.h" + +namespace oneflow { + +namespace pthread_fork { + +static bool is_fork = false; + +bool IsForkedSubProcess() { return is_fork; } +static void SetIsForkedSubProcess() { is_fork = true; } + +void RegisterForkCallback() { pthread_atfork(nullptr, nullptr, SetIsForkedSubProcess); } +COMMAND(RegisterForkCallback()); + +} // namespace pthread_fork + +} // namespace oneflow diff --git a/oneflow/core/vm/virtual_machine.cpp b/oneflow/core/vm/virtual_machine.cpp index 4a9e7e5fbc6..ef9e5b9bc1c 100644 --- a/oneflow/core/vm/virtual_machine.cpp +++ b/oneflow/core/vm/virtual_machine.cpp @@ -24,6 +24,7 @@ limitations under the License. #include "oneflow/core/common/spin_counter.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/platform/include/pthread_fork.h" namespace oneflow { namespace vm { @@ -579,6 +580,10 @@ void VirtualMachine::__Init__(const VmDesc& vm_desc, ObjectMsgAllocator* allocat int64_t InstructionMaxRunningSeconds() { return 60 * 5; } Maybe VirtualMachine::Receive(InstructionMsgList* compute_instr_msg_list) { + CHECK_OR_RETURN(!pthread_fork::IsForkedSubProcess()) + << "Cannot run OneFlow in forked subprocess. 
Please add " + "'multiprocessing.set_start_method(\"spawn\")' in '__main__' if you are using Python's " + "multiprocessing"; InstructionMsgList new_instr_msg_list; OBJECT_MSG_LIST_FOR_EACH_PTR(compute_instr_msg_list, compute_instr_msg) { if (!compute_instr_msg->phy_instr_operand()) { diff --git a/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp b/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp index cee413a18d1..a74f73ea0df 100644 --- a/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp +++ b/oneflow/user/kernels/adaptive_pool_cpu_kernel.cpp @@ -44,7 +44,7 @@ void AvgForwardCompute(user_op::KernelComputeContext* ctx, const int32_t& dim) { const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(x_shape, data_format, dim); const Shape& out = GetShape5D(y_shape, data_format, dim); @@ -100,7 +100,7 @@ void AvgBackwardCompute(user_op::KernelComputeContext* ctx, const int32_t& dim) const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(dx_shape, data_format, dim); const Shape& out = GetShape5D(dy_shape, data_format, dim); @@ -234,9 +234,7 @@ class AdaptivePool3DCpuGradKernel final : public user_op::OpKernel { #define REGISTER_ADAPTIVE_POOL_KERNEL_WITH_DEVICE(device) \ REGISTER_ADAPTIVE_POOL_KERNEL(device, float) \ REGISTER_ADAPTIVE_POOL_KERNEL(device, double) \ - REGISTER_ADAPTIVE_POOL_KERNEL(device, int8_t) \ - REGISTER_ADAPTIVE_POOL_KERNEL(device, int32_t) \ - REGISTER_ADAPTIVE_POOL_KERNEL(device, int64_t) + REGISTER_ADAPTIVE_POOL_KERNEL(device, int) REGISTER_ADAPTIVE_POOL_KERNEL_WITH_DEVICE(DeviceType::kCPU) @@ -257,9 +255,7 @@ REGISTER_ADAPTIVE_POOL_KERNEL_WITH_DEVICE(DeviceType::kCPU) #define REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL_WITH_DEVICE(device) \ REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, float) \ REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, double) \ - REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int8_t) \ - REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int32_t) \ - REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int64_t) + REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL(device, int) REGISTER_ADAPTIVE_POOL_BACKWARD_KERNEL_WITH_DEVICE(DeviceType::kCPU) } // namespace oneflow diff --git a/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu b/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu index 4beea9e79de..6d4dff01f7b 100644 --- a/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu +++ b/oneflow/user/kernels/adaptive_pool_gpu_kernel.cu @@ -18,6 +18,7 @@ limitations under the License. 
#include "oneflow/core/kernel/kernel_util.cuh" #include "oneflow/core/common/data_type.h" #include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/cuda/atomic.cuh" #include "oneflow/core/operator/operator_util.h" #include "oneflow/user/utils/pool_util.h" @@ -59,6 +60,7 @@ __global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_ele const int in_panel_size = in_d * in_h * in_w; CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' int bc_idx = idx / out_panel_size; int out_d_idx = (idx % out_panel_size) / out_w / out_h; int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; @@ -100,6 +102,7 @@ __global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num const int in_panel_size = in_d * in_h * in_w; CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' int bc_idx = idx / out_panel_size; int out_d_idx = (idx % out_panel_size) / out_w / out_h; int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; @@ -122,7 +125,10 @@ __global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; for (int id = 0; id < k_d; ++id) { for (int ih = 0; ih < k_h; ++ih) { - for (int iw = 0; iw < k_w; ++iw) { *(input_ptr + ih * in_w + iw) += grad_delta; } + for (int iw = 0; iw < k_w; ++iw) { + // TODO (Tianyu): Use 'atmoic::Add' when necessary + cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta); + } } input_ptr += in_h * in_w; // next input depth } @@ -139,7 +145,7 @@ void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(x_shape, data_format, dim); const Shape& out = GetShape5D(y_shape, data_format, dim); @@ -160,7 +166,7 @@ void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); - // TODO: Support 'channels_last' + // TODO (Tianyu): Support 'channels_last' std::string data_format = "channels_first"; const Shape& in = GetShape5D(dx_shape, data_format, dim); const Shape& out = GetShape5D(dy_shape, data_format, dim); @@ -258,9 +264,7 @@ class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel { REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, float); REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, double); -REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int8_t); -REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int32_t); -REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int64_t); +REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int); #define REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \ REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \ @@ -278,9 +282,7 @@ REGISTER_GPU_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kGPU, int64_t); REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, float); REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, double); -REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int8_t); 
-REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int32_t); -REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int64_t); +REGISTER_GPU_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kGPU, int); } // namespace user_op diff --git a/oneflow/user/kernels/eager_nccl_kernels.cpp b/oneflow/user/kernels/eager_nccl_kernels.cpp index cf9b97e25a0..15307ffbebd 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.cpp +++ b/oneflow/user/kernels/eager_nccl_kernels.cpp @@ -16,6 +16,7 @@ limitations under the License. #include "oneflow/core/framework/framework.h" #include "oneflow/core/ccl/ccl.h" #include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/control/global_process_ctx.h" namespace oneflow { @@ -57,11 +58,15 @@ class EagerCclBroadcastKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); - CHECK_EQ(in->data_type(), out->data_type()); int64_t root = ctx->Attr("root"); - CHECK_JUST(ccl::Broadcast(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), - in->data_type(), root, + const void* in_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->data_type(), out->data_type()); + in_ptr = in->dptr(); + } + CHECK_JUST(ccl::Broadcast(in_ptr, out->mut_dptr(), out->shape().elem_cnt(), + out->data_type(), root, kernel_state->parallel_desc(), ctx->device_ctx())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } diff --git a/oneflow/user/kernels/eager_nccl_kernels.cu b/oneflow/user/kernels/eager_nccl_kernels.cu index a0af139844f..5c83fe7cb8f 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.cu +++ b/oneflow/user/kernels/eager_nccl_kernels.cu @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneflow/core/common/container_util.h" +#include "oneflow/core/control/global_process_ctx.h" #include "oneflow/core/framework/framework.h" #include "oneflow/core/device/nccl_util.h" #include "oneflow/core/job/eager_nccl_comm_manager.h" @@ -97,11 +98,15 @@ class EagerNcclBroadcastKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape(), out->shape()); - CHECK_EQ(in->data_type(), out->data_type()); int64_t root = ctx->Attr("root"); - OF_NCCL_CHECK(ncclBroadcast(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), - GetNcclDataType(in->data_type()), root, kernel_state->comm(), + const void* in_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape(), out->shape()); + CHECK_EQ(in->data_type(), out->data_type()); + in_ptr = in->dptr(); + } + OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape().elem_cnt(), + GetNcclDataType(out->data_type()), root, kernel_state->comm(), ctx->device_ctx()->cuda_stream())); }; bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -157,7 +162,6 @@ class EagerNcclReduceScatterKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK(!(in->shape() == out->shape())); CHECK_EQ(in->data_type(), out->data_type()); const auto& op_type = ctx->Attr("op_type"); OF_NCCL_CHECK(ncclReduceScatter(in->dptr(), out->mut_dptr(), out->shape().elem_cnt(), @@ -193,7 +197,6 @@ class EagerNcclAllGatherKernel final : public user_op::OpKernel { CHECK(kernel_state != nullptr); const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK(!(in->shape() == out->shape())); CHECK_EQ(in->data_type(), out->data_type()); OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape().elem_cnt(), GetNcclDataType(in->data_type()), kernel_state->comm(), diff --git a/oneflow/user/kernels/randint_kernel.cpp b/oneflow/user/kernels/randint_kernel.cpp deleted file mode 100644 index ac604ae8426..00000000000 --- a/oneflow/user/kernels/randint_kernel.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/distributions/uniform_kernel.h" -#include "oneflow/core/device/device_context.h" -#include "oneflow/core/ndarray/xpu_util.h" -namespace oneflow { - -class CpuRandintKernel final : public user_op::OpKernel { - public: - CpuRandintKernel() = default; - ~CpuRandintKernel() = default; - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeAutoGenerator()); - generator->set_current_seed(ctx->Attr("seed")); - return std::make_shared(generator); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t* output = out->mut_dptr(); - auto* randint_kernel_state = dynamic_cast(state); - CHECK_NOTNULL(randint_kernel_state); - const auto& generator = randint_kernel_state->generator(); - const auto& cpu_generator = CHECK_JUST(generator->Get()); - CHECK_NOTNULL(generator); - const int64_t n = out->shape().elem_cnt(); - const int64_t low = ctx->Attr("low"); - const int64_t high = ctx->Attr("high"); - std::uniform_int_distribution dis(low, high - 1); - XPU_1D_KERNEL_LOOP(i, n) - output[i] = dis(cpu_generator->engine()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("randint").SetCreateFn().SetIsMatchedHob( - (user_op::HobDeviceTag() == "cpu")); - -} // namespace oneflow diff --git a/oneflow/user/kernels/randint_kernel.cu b/oneflow/user/kernels/randint_kernel.cu deleted file mode 100644 index da6bc2644e7..00000000000 --- a/oneflow/user/kernels/randint_kernel.cu +++ /dev/null @@ -1,67 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/device/device_context.h" -#include "oneflow/core/ndarray/xpu_util.h" -#include "oneflow/user/kernels/distributions/uniform_kernel.h" -#include -#include -namespace oneflow { -__global__ void GenValues(int64_t* a, const int64_t low, const int64_t high, int32_t n, - curandState* state) { - XPU_1D_KERNEL_LOOP(i, n) { - a[i] = curand(state + i) % (high - low) - + low; //@TODO:curandState only generates 32-bit random number - } -} - -class GpuRandintKernel final : public user_op::OpKernel { - public: - GpuRandintKernel() = default; - ~GpuRandintKernel() = default; - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeAutoGenerator()); - generator->set_current_seed(ctx->Attr("seed")); - return std::make_shared(generator); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t* output = out->mut_dptr(); - auto* randint_kernel_state = dynamic_cast(state); - CHECK_NOTNULL(randint_kernel_state); - const auto& generator = randint_kernel_state->generator(); - const auto& gpu_generator = CHECK_JUST(generator->Get()); - CHECK_NOTNULL(generator); - - int32_t block_num = gpu_generator->max_block_num(); - int32_t thread_num = gpu_generator->max_thread_num(); - curandState* curand_states = gpu_generator->curand_states(); - - const int32_t n = out->shape().elem_cnt(); - const int64_t low = ctx->Attr("low"); - const int64_t high = ctx->Attr("high"); - GenValues<<device_ctx()->cuda_stream()>>>( - output, low, high, n, curand_states); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("randint").SetCreateFn().SetIsMatchedHob( - (user_op::HobDeviceTag() == "gpu")); - -} // namespace oneflow diff --git a/oneflow/user/ops/randint_op.cpp b/oneflow/user/ops/randint_op.cpp deleted file mode 100644 index 2c4e44ca14e..00000000000 --- a/oneflow/user/ops/randint_op.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/protobuf.h" -#include "oneflow/core/common/global.h" -#include "oneflow/core/job/global_for.h" - -namespace oneflow { - -Maybe InferRandintNdSbp(user_op::InferNdSbpFnContext* ctx); - -REGISTER_NO_GRAD_USER_OP("randint") - .Output("out") - .Attr("low") - .Attr("high") - .Attr("shape") - .Attr("nd_sbp") - .SetTensorDescInferFn([](user_op::InferContext* ctx) -> Maybe { - Shape* out_shape = ctx->OutputShape("out", 0); - const Shape& shape = ctx->Attr("shape"); - DimVector dim_vec; - if (shape.NumAxes() > 0) { - dim_vec.insert(dim_vec.end(), shape.dim_vec().cbegin(), shape.dim_vec().cend()); - } - *out_shape = Shape(dim_vec); - return Maybe::Ok(); - }) - .SetGetSbpFn([](user_op::SbpContext* ctx) -> Maybe { return Maybe::Ok(); }) - .SetDataTypeInferFn([](user_op::InferContext* ctx) -> Maybe { - *ctx->OutputDType("out", 0) = DataType::kInt64; - return Maybe::Ok(); - }) - .SetNdSbpInferFn(&InferRandintNdSbp); - -Maybe InferRandintNdSbp(user_op::InferNdSbpFnContext* ctx) { - cfg::NdSbp* out = ctx->NdSbp4ArgNameAndIndex("out", 0); - if (JUST(*Global, MultiClient>::Get())) { - const auto& pb_str = ctx->user_op_conf().attr("nd_sbp"); - NdSbp pb; - CHECK_OR_RETURN(TxtString2PbMessage(pb_str, &pb)); - out->InitFromProto(pb); - } else { - out->mutable_sbp_parallel()->Add()->mutable_broadcast_parallel(); - } - return Maybe::Ok(); -} - -} // namespace oneflow diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py index 2b2dc766731..f3c2c529cff 100644 --- a/python/oneflow/__init__.py +++ b/python/oneflow/__init__.py @@ -22,7 +22,6 @@ Size = oneflow._oneflow_internal.Size device = oneflow._oneflow_internal.device placement = oneflow._oneflow_internal.placement -no_grad = oneflow._oneflow_internal.autograd.no_grad locals()["dtype"] = oneflow._oneflow_internal.dtype locals()["char"] = oneflow._oneflow_internal.char locals()["float16"] = oneflow._oneflow_internal.float16 @@ -114,6 +113,7 @@ def _SyncOnMasterFn(): register_docstr() del register_docstr del docstr +from oneflow.autograd import grad_enable, no_grad, inference_mode, is_grad_enabled import oneflow.nn.image import oneflow.nn.modules.acosh import oneflow.nn.modules.activation @@ -257,8 +257,8 @@ def _SyncOnMasterFn(): from oneflow.nn.modules.random_ops import bernoulli from oneflow.nn.modules.random_ops import rand_op as rand from oneflow.nn.modules.random_ops import randn_op as randn -from oneflow.nn.modules.random_ops import randint -from oneflow.nn.modules.random_ops import randperm +from oneflow.nn.modules.random_ops import randint_op as randint +from oneflow.nn.modules.random_ops import randperm_op as randperm from oneflow.nn.modules.reduce_ops import _max as max from oneflow.nn.modules.reduce_ops import _mean as mean from oneflow.nn.modules.reduce_ops import _min as min diff --git a/python/oneflow/autograd/__init__.py b/python/oneflow/autograd/__init__.py index 8452d5241fa..aaee33d6682 100644 --- a/python/oneflow/autograd/__init__.py +++ b/python/oneflow/autograd/__init__.py @@ -15,3 +15,9 @@ """ from oneflow.autograd.autograd import backward, grad +from oneflow.autograd.autograd_mode import ( + inference_mode, + grad_enable, + no_grad, + is_grad_enabled, +) diff --git a/python/oneflow/autograd/autograd_mode.py b/python/oneflow/autograd/autograd_mode.py new file mode 100644 index 00000000000..49fcbb0212a --- /dev/null +++ b/python/oneflow/autograd/autograd_mode.py @@ -0,0 +1,169 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.autograd import AutoGradMode
+
+
+def is_grad_enabled():
+    r"""
+    Returns True if grad mode is currently enabled.
+    """
+    return oneflow._oneflow_internal.autograd.is_grad_enabled()
+
+
+class inference_mode:
+    r"""
+    Context-manager that enables or disables inference mode.
+
+    InferenceMode is a new context manager analogous to no_grad to be used when you are certain
+    your operations will have no interactions with autograd (e.g., data processing or model
+    evaluation). Code run under this mode gets better performance by disabling view tracking
+    and version counter bumps.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    Args:
+        mode (bool): Flag whether to enable or disable inference mode. (default: True)
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> x = flow.ones(2, 3, requires_grad=True)
+        >>> with flow.inference_mode():
+        ...     y = x * x
+        >>> y.requires_grad
+        False
+        >>> @flow.inference_mode()
+        ... def no_grad_func(x):
+        ...     return x * x
+        >>> y = no_grad_func(x)
+        >>> y.requires_grad
+        False
+    """
+
+    def __init__(self, mode=True):
+        self.infer_mode = mode
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(not self.infer_mode):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    def __enter__(self):
+        self.grad_mode = AutoGradMode(not self.infer_mode)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class grad_enable:
+    r"""
+    Context-manager that enables gradient calculation.
+
+    Enables gradient calculation, if it has been disabled via no_grad.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> x = flow.ones(2, 3, requires_grad=True)
+        >>> with flow.no_grad():
+        ...     with flow.grad_enable():
+        ...         y = x * x
+        >>> y.requires_grad
+        True
+        >>> @flow.grad_enable()
+        ... def no_grad_func(x):
+        ...     return x * x
+        >>> with flow.no_grad():
+        ...     y = no_grad_func(x)
+        >>> y.requires_grad
+        True
+    """
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(True):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    def __enter__(self):
+        self.grad_mode = AutoGradMode(True)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class no_grad:
+    r"""
+    Context-manager that disables gradient calculation.
+
+    Disabling gradient calculation is useful for inference, when you are sure that
+    you will not call Tensor.backward(). It will reduce memory consumption for computations
+    that would otherwise have requires_grad=True.
+
+    In this mode, the result of every computation will have requires_grad=False, even when
+    the inputs have requires_grad=True.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> x = flow.ones(2, 3, requires_grad=True)
+        >>> with flow.no_grad():
+        ...     y = x * x
+        >>> y.requires_grad
+        False
+        >>> @flow.no_grad()
+        ... def no_grad_func(x):
+        ...     return x * x
+        >>> y = no_grad_func(x)
+        >>> y.requires_grad
+        False
+    """
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(False):
+                return func(*args, **kwargs)
+
+        return wrapper
+
+    def __enter__(self):
+        self.grad_mode = AutoGradMode(False)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod(raise_on_error=True)
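Editorial note: a minimal sketch of how the three mode managers added above compose, using only names this file exports (`flow.no_grad`, `flow.grad_enable`, `flow.inference_mode`, `flow.is_grad_enabled`); the tensor values are illustrative, not taken from the patch:

```python
import oneflow as flow

x = flow.ones(2, 3, requires_grad=True)

with flow.no_grad():
    # gradient tracking is off inside no_grad
    assert not flow.is_grad_enabled()
    with flow.grad_enable():
        # grad_enable re-enables tracking inside an enclosing no_grad
        y = x * x
assert y.requires_grad


@flow.inference_mode()  # here this maps to AutoGradMode(False), like no_grad
def square(t):
    return t * t


assert not square(x).requires_grad
```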
diff --git a/python/oneflow/compatible/single_client/__init__.py b/python/oneflow/compatible/single_client/__init__.py
index 3a8db4bd5df..2501f0b39f1 100644
--- a/python/oneflow/compatible/single_client/__init__.py
+++ b/python/oneflow/compatible/single_client/__init__.py
@@ -19,7 +19,6 @@
 Size = oneflow._oneflow_internal.Size
 device = oneflow._oneflow_internal.device
 placement = oneflow._oneflow_internal.placement
-no_grad = oneflow._oneflow_internal.autograd.no_grad
 locals()["dtype"] = oneflow._oneflow_internal.dtype
 locals()["char"] = oneflow._oneflow_internal.char
 locals()["float16"] = oneflow._oneflow_internal.float16
@@ -115,6 +114,7 @@ def custom_exit(returncode):
 sys.exit = custom_exit
 del custom_exit
 del sys
+from oneflow.compatible.single_client.autograd import no_grad
 import oneflow.compatible.single_client.nn.modules.acosh
 import oneflow.compatible.single_client.nn.modules.activation
 import oneflow.compatible.single_client.nn.modules.argwhere
diff --git a/python/oneflow/compatible/single_client/autograd/__init__.py b/python/oneflow/compatible/single_client/autograd/__init__.py
index 2cd8029edd4..79b6e3b3bb6 100644
--- a/python/oneflow/compatible/single_client/autograd/__init__.py
+++ b/python/oneflow/compatible/single_client/autograd/__init__.py
@@ -15,3 +15,4 @@
 """
 from oneflow.compatible.single_client.autograd.autograd import backward, grad
+from oneflow.compatible.single_client.autograd.autograd_mode import no_grad
diff --git a/python/oneflow/compatible/single_client/autograd/autograd_mode.py b/python/oneflow/compatible/single_client/autograd/autograd_mode.py
new file mode 100644
index 00000000000..b9d0d464a43
--- /dev/null
+++ b/python/oneflow/compatible/single_client/autograd/autograd_mode.py
@@ -0,0 +1,46 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import oneflow._oneflow_internal
+from oneflow._oneflow_internal.autograd import AutoGradMode
+
+
+class no_grad(AutoGradMode):
+    r"""
+    Context-manager that disables gradient calculation.
+
+    Disabling gradient calculation is useful for inference, when you are sure that
+    you will not call Tensor.backward(). It will reduce memory consumption for computations
+    that would otherwise have requires_grad=True.
+
+    In this mode, the result of every computation will have requires_grad=False, even when
+    the inputs have requires_grad=True.
+
+    This context manager is thread local; it will not affect computation in other threads.
+
+    Also functions as a decorator. (Make sure to instantiate with parentheses.)
+
+    """
+
+    def __init__(self):
+        super().__init__(False)
+
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            with AutoGradMode(False):
+                return func(*args, **kwargs)
+
+        return wrapper
diff --git a/python/oneflow/framework/tensor_str.py b/python/oneflow/framework/tensor_str.py
index 6ea41fb4dca..10ecfbbc8c3 100644
--- a/python/oneflow/framework/tensor_str.py
+++ b/python/oneflow/framework/tensor_str.py
@@ -36,7 +36,9 @@ class __PrinterOptions(object):
 
 def _try_convert_to_local_tensor(tensor):
     if tensor.is_consistent:
-        tensor = tensor.to_consistent(sbp=flow.sbp.broadcast).to_local()
+        tensor = tensor.to_consistent(
+            placement=tensor.placement, sbp=flow.sbp.broadcast
+        ).to_local()
     return tensor
 
 
@@ -214,8 +216,9 @@ def _cannot_print(sbp):
         and sbp != flow.sbp.split(0)
     )
 
-    # TODO: delete it when s1->b is ready
+    # TODO: delete it when boxing on "CPU" and s1->b on "GPU" are ready
     if self.is_consistent:
+        self = self.to("cuda")
         if all(_cannot_print(sbp) for sbp in self.sbp):
             return "[...]"
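Editorial note on the `_try_convert_to_local_tensor` change above: passing the tensor's own placement keeps the broadcast cast on the devices that already hold the data rather than relying on a default placement. A hedged sketch (the placement value is illustrative, not from the patch):

```python
import oneflow as flow

# Hypothetical consistent tensor whose repr needs the full data on every rank.
placement = flow.placement("cuda", {0: range(2)})
x = flow.ones((4, 4)).to_consistent(placement=placement, sbp=flow.sbp.split(0))

# As in the patched helper: keep x.placement, change only sbp, then localize.
local = x.to_consistent(placement=x.placement, sbp=flow.sbp.broadcast).to_local()
```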
diff --git a/python/oneflow/nn/modules/adaptive_pool.py b/python/oneflow/nn/modules/adaptive_pool.py
index ef0a37c7557..fb6ccd18960 100644
--- a/python/oneflow/nn/modules/adaptive_pool.py
+++ b/python/oneflow/nn/modules/adaptive_pool.py
@@ -15,27 +15,23 @@
 """
 import oneflow as flow
 from oneflow.nn.module import Module
+from oneflow.nn.common_types import _size_1_t
+from oneflow.nn.modules.utils import _single, _pair, _triple
 
 
 def _generate_output_size(input_size, output_size):
     new_output_size = []
-    if isinstance(output_size, int):
-        for _ in range(len(input_size) - 2):
-            new_output_size.append(output_size)
-    elif isinstance(output_size, tuple):
-        assert len(input_size) - 2 == len(
-            output_size
-        ), f"The length of 'output_size' does not match the input size, {len(input_size) - 2} expected"
-        for i in range(len(output_size)):
-            if output_size[i] is None:
-                new_output_size.append(input_size[i + 2])
-            else:
-                assert isinstance(
-                    output_size[i], int
-                ), "numbers in 'output_size' should be integer"
-                new_output_size.append(output_size[i])
-    else:
-        raise ValueError("invalid 'output_size', 'int' or 'tuple' expected")
+    assert len(input_size) - 2 == len(
+        output_size
+    ), f"the length of 'output_size' does not match the input size, {len(input_size) - 2} expected"
+    for i in range(len(output_size)):
+        if output_size[i] is None:
+            new_output_size.append(input_size[i + 2])
+        else:
+            assert isinstance(
+                output_size[i], int
+            ), "numbers in 'output_size' should be integers"
+            new_output_size.append(output_size[i])
     return tuple(new_output_size)
@@ -55,7 +51,7 @@ class AdaptiveAvgPool1d(Module):
         >>> import numpy as np
         >>> import oneflow as flow
         >>> import oneflow.nn as nn
-        
+
         >>> m = nn.AdaptiveAvgPool1d(5)
         >>> input = flow.Tensor(np.random.randn(1, 64, 8))
         >>> output = m(input)
@@ -64,19 +60,19 @@
 
     """
 
-    def __init__(self, output_size) -> None:
+    def __init__(self, output_size: _size_1_t) -> None:
         super().__init__()
-        self.output_size = output_size
+        assert output_size is not None, "'output_size' cannot be NoneType"
+        self.output_size = _single(output_size)
 
     def forward(self, x):
-        assert len(x.shape) == 3
-        if isinstance(self.output_size, tuple):
-            new_output_size = self.output_size[0]
-        elif isinstance(self.output_size, int):
-            new_output_size = self.output_size
-        else:
-            raise ValueError("'output_size' should be integer or tuple")
-        return flow.F.adaptive_avg_pool1d(x, output_size=(new_output_size,))
+        assert (
+            len(x.shape) == 3 and len(self.output_size) == 1
+        ), "the length of 'output_size' does not match the input size, 1 expected"
+        assert isinstance(
+            self.output_size[0], int
+        ), "numbers in 'output_size' should be integers"
+        return flow.F.adaptive_avg_pool1d(x, output_size=self.output_size)
 
 
 def adaptive_avg_pool1d(input, output_size):
@@ -110,7 +106,7 @@ class AdaptiveAvgPool2d(Module):
         >>> import numpy as np
         >>> import oneflow as flow
         >>> import oneflow.nn as nn
-        
+
         >>> m = nn.AdaptiveAvgPool2d((5,7))
         >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9))
         >>> output = m(input)
@@ -133,10 +129,13 @@
 
     def __init__(self, output_size) -> None:
         super().__init__()
-        self.output_size = output_size
+        assert output_size is not None, "'output_size' cannot be NoneType"
+        self.output_size = _pair(output_size)
 
     def forward(self, x):
-        assert len(x.shape) == 4
+        assert (
+            len(x.shape) == 4
+        ), f"expected 4-dimensional tensor, but got {len(x.shape)}-dimensional tensor"
         new_output_size = _generate_output_size(x.shape, self.output_size)
         return flow.F.adaptive_avg_pool2d(x, output_size=new_output_size)
@@ -172,7 +171,7 @@ class AdaptiveAvgPool3d(Module):
         >>> import numpy as np
         >>> import oneflow as flow
         >>> import oneflow.nn as nn
-        
+
         >>> m = nn.AdaptiveAvgPool3d((5,7,9))
         >>> input = flow.Tensor(np.random.randn(1, 64, 8, 9, 10))
         >>> output = m(input)
@@ -195,10 +194,13 @@
 
     def __init__(self, output_size) -> None:
         super().__init__()
-        self.output_size = output_size
+        assert output_size is not None, "'output_size' cannot be NoneType"
+        self.output_size = _triple(output_size)
 
     def forward(self, x):
-        assert len(x.shape) == 5
+        assert (
+            len(x.shape) == 5
+        ), f"expected 5-dimensional tensor, but got {len(x.shape)}-dimensional tensor"
         new_output_size = _generate_output_size(x.shape, self.output_size)
         return flow.F.adaptive_avg_pool3d(x, output_size=new_output_size)
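As the rewritten `_generate_output_size` above shows, `output_size` is now always normalized to a tuple (`_single`/`_pair`/`_triple`), and a `None` entry keeps the matching input dimension. A sketch of the resulting behavior (shapes chosen for illustration, not taken from the test suite):

```python
import numpy as np
import oneflow as flow
import oneflow.nn as nn

x = flow.Tensor(np.random.randn(1, 8, 16, 16))

# An int is expanded by _pair, so 7 means (7, 7).
print(nn.AdaptiveAvgPool2d(7)(x).shape)  # expected: (1, 8, 7, 7)

# A None entry keeps the corresponding input dimension (here the height, 16).
print(nn.AdaptiveAvgPool2d((None, 5))(x).shape)  # expected: (1, 8, 16, 5)
```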
diff --git a/python/oneflow/nn/modules/consistent_cast.py b/python/oneflow/nn/modules/consistent_cast.py
index 0000cf2dd15..e37ada9f778 100644
--- a/python/oneflow/nn/modules/consistent_cast.py
+++ b/python/oneflow/nn/modules/consistent_cast.py
@@ -35,9 +35,7 @@ def forward(self, x, sbp, placement):
 
 @register_tensor_op("to_consistent")
-def to_consistent_op(
-    input, placement=None, sbp=None, identity_grad=False, grad_sbp=None
-):
+def to_consistent_op(input, placement=None, sbp=None, grad_sbp=None):
     """Cast a local tensor to consistent tensor or cast a
     consistent tensor to another consistent tensor with
     different sbp or placement
@@ -90,9 +88,6 @@ def _check_sbp(sbp):
 
     grad_sbp = _check_sbp(grad_sbp)
 
-    if identity_grad is True and grad_sbp is not None:
-        raise ValueError("If identity_grad is True, grad_sbp should not be set.")
-
     else:
         # local tensor to consistent tensor
         if placement is None or sbp is None:
@@ -106,7 +101,7 @@ def _check_sbp(sbp):
 
         if grad_sbp is None:
             grad_sbp = tuple()
-    return flow.F.to_consistent(input, placement, sbp, identity_grad, grad_sbp)
+    return flow.F.to_consistent(input, placement, sbp, grad_sbp)
 
 
 class ToLocal(Module):
diff --git a/python/oneflow/nn/modules/random_ops.py b/python/oneflow/nn/modules/random_ops.py
index 71225a34ec6..af0f2eb96f0 100644
--- a/python/oneflow/nn/modules/random_ops.py
+++ b/python/oneflow/nn/modules/random_ops.py
@@ -264,52 +264,14 @@ def randn_op(
     )()
 
-
-class Randint(flow.nn.Module):
+class RandInt(flow.nn.Module):
     def __init__(
         self,
         low: flow.int64,
         high: flow.int64,
         size: tuple,
         generator: flow.Generator = None,
-        dtype: flow.dtype = flow.int64,
-        layout=None,
-        device=None,
-        placement=None,
-        sbp=None,
-        requires_grad=False,
-    ) -> None:
-        super().__init__()
-
-        if generator is None:
-            generator = flow.Generator()
-        assert low < high
-
-class Randperm(Module):
-    def __init__(
-        self,
-        n,
-        generator: flow.Generator = None,
-        dtype: flow.dtype = flow.int32,
-        layout=None,
-        device: Union[flow.device, str, None] = None,
-        placement: flow.placement = None,
-        sbp: flow._oneflow_internal.sbp.sbp = None,
-        requires_grad: bool = False,
-        pin_memory: bool = False,
-    ) -> None:
-        super().__init__()
-        assert n >= 0
-        self.n = n
-
-    class Randint(flow.nn.Module):
-        def __init__(
-            self,
-            low: flow.int64,
-            high: flow.int64,
-            size: tuple,
-            generator: flow.Generator = None,
-            dtype: flow.dtype = flow.int64,
+        dtype: Optional[flow.dtype] = None,
         layout=None,
         device=None,
         placement=None,
@@ -328,7 +290,6 @@ def __init__(
             self.generator,
             self.placement,
             self.sbp,
-
         ) = _rand_op_common_process(size, device, generator, placement, sbp)
         self.dtype = dtype
         self.low = low
@@ -337,93 +298,124 @@ def __init__(
     def forward(self):
         if self.placement is not None:
             res = flow.F.consistent_randint(
-                self.low, self.high, self.size, self.placement, self.sbp, self.generator
+                self.low,
+                self.high,
+                self.size,
+                self.placement,
+                self.sbp,
+                self.dtype,
+                self.generator,
             )
         else:
             res = flow.F.randint(
-                self.low, self.high, self.size, self.device, self.generator
+                self.low, self.high, self.size, self.dtype, self.device, self.generator
            )
         res.requires_grad = self.requires_grad
         return res
 
 
-def randint(
-    low: flow.int64 = 0,
-    high: Union[int, tuple] = None,
-    size: tuple = None,
-    generator: flow.Generator = None,
-    dtype: flow.dtype = flow.int64,
+def randint_op(
+    low: flow.int64,
+    high: flow.int64,
+    size: tuple,
+    out=None,
+    generator=None,
+    dtype: Optional[flow.dtype] = None,
+    layout=None,
+    device: Union[flow.device, str, None] = None,
+    placement: flow.placement = None,
+    sbp: flow._oneflow_internal.sbp.sbp = None,
+    requires_grad: bool = False,
+):
+    """
+    Returns a tensor filled with random integers generated uniformly between low (inclusive) and high (exclusive).
 
-    ) = _rand_op_common_process(1, device, generator, placement, sbp)
-    self.dtype = dtype
+    The shape of the tensor is defined by the variable argument ``size``.
+
+    Args:
+        low (int): Lowest integer to be drawn from the distribution.
+        high (int): One above the highest integer to be drawn from the distribution.
+        size (int... or flow.Size): Defining the shape of the output tensor.
+            Can be a variable number of arguments or a collection like a list or tuple or flow.Size.
+        out (optional): The output tensor.
+        dtype (flow.dtype, optional): The desired data type of returned tensor. Default: ``flow.int64``.
+        layout (optional): The desired layout of returned Tensor.
+        generator (flow.Generator, optional): A pseudorandom number generator for sampling.
+        device (flow.device, optional): The desired device of returned local tensor. If None, uses the
+            current device.
+        placement (flow.placement, optional): The desired device of returned consistent tensor. If None, will
+            construct local tensor.
+        sbp (flow.sbp, optional): The desired sbp of returned consistent tensor. It must be equal with the
+            numbers of placement.
+        requires_grad (bool, optional): If autograd should record operations on the returned tensor. Default: False.
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow as flow
+        >>> generator = flow.Generator()
+        >>> generator.manual_seed(0)
+        >>> flow.randint(0, 5, (5,), generator=generator)
+        tensor([2, 4, 3, 0, 1], dtype=oneflow.int64)
+
+    """
+    assert out is None, "out not supported yet"
+    assert layout is None, "layout not supported yet"
+    if generator is None:
+        generator = flow.default_generator()
+    return RandInt(
+        low, high, size, generator, dtype, layout, device, placement, sbp, requires_grad
+    )()
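The `randint_op` wrapper above dispatches on `placement`: given a placement and sbp it routes to `flow.F.consistent_randint`, otherwise to `flow.F.randint`. A hedged usage sketch (device and placement values are illustrative; outputs depend on the seed):

```python
import oneflow as flow

gen = flow.Generator()
gen.manual_seed(0)

# Local path: routed to flow.F.randint(low, high, size, dtype, device, generator).
x = flow.randint(0, 10, (2, 3), generator=gen, device="cpu")

# Consistent path: placement + sbp route to flow.F.consistent_randint.
placement = flow.placement("cuda", {0: range(2)})
y = flow.randint(0, 10, (2, 3), placement=placement, sbp=flow.sbp.broadcast)
```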
+
+
+class RandPerm(Module):
+    def __init__(
+        self,
+        n,
+        generator: flow.Generator = None,
+        dtype: Optional[flow.dtype] = None,
+        layout=None,
+        device: Union[flow.device, str, None] = None,
+        placement: flow.placement = None,
+        sbp: flow._oneflow_internal.sbp.sbp = None,
+        requires_grad: bool = False,
+        pin_memory: bool = False,
+    ) -> None:
+        super().__init__()
+        assert n >= 0
+        self.n = n
 
     def forward(self, out=None):
         if self.placement is not None:
             res = flow.F.consistent_randperm(
-                self.n, self.placement, self.sbp, self.generator
+                self.n, self.placement, self.sbp, self.dtype, self.generator
             )
         else:
-            res = flow.F.randperm(self.n, self.device, self.generator)
+            res = flow.F.randperm(self.n, self.dtype, self.device, self.generator)
         res.requires_grad = self.requires_grad
-        return res.to(dtype=self.dtype)
+        return res
 
 
-def randperm(
+def randperm_op(
     n: flow.int32,
     generator: flow.Generator = None,
     out=None,
-    dtype: flow.dtype = flow.int32,
-
+    dtype: Optional[flow.dtype] = None,
     layout=None,
     device: Union[flow.device, str, None] = None,
     placement: flow.placement = None,
     sbp: flow._oneflow_internal.sbp.sbp = None,
     requires_grad: bool = False,
-    ) -> flow.Tensor:
-    r"""Returns a tensor filled with random integers generated uniformly from :math:`[ \text{low},\text{high} )`.
-
-
-    The shape of the tensor is defined by the variable argument size.
-
-    Args:
-        low (int, optional):Lowest integer to be drawn from the distribution. Default: 0.
-
-        high (int):One above the highest integer to be drawn from the distribution.
-
-        size (tuple):a tuple defining the shape of the output tensor.
-
-    Keyword args:
-        generator(:class:`oneflow.Generator`, optional): a pseudorandom number generator for sampling
-        dtype (:class:`oneflow.dtype`, optional): the desired data type of returned tensor.
-            Default: ``oneflow.int64``.
-        layout: layout is not supported yet.
-        device: the desired device of returned tensor. Default: cpu.
-        requires_grad(bool, optional): If autograd should record operations on the returned tensor. Default: False.
-        placement (flow.placement, optional): The desired device of returned consistent tensor. If None, will
-            construct local tensor.
-        sbp (flow.sbp, optional): The desired sbp of returned consistent tensor. It must be equal with the
-            numbers of placement.
-        requires_grad (bool, optional): If autograd should record operations on the returned tensor. Default: False.
-
-    Returns:
-        oneflow.Tensor: The result Tensor of given size.
-
-    For example:
-
     pin_memory: bool = False,
 ):
     r"""
     Returns a random permutation of integers from ``0`` to ``n - 1``.
 
     Args:
         n (int): the upper bound (exclusive)
-
+
     Keyword args:
         generator(:class:`oneflow.Generator`, optional): a pseudorandom number generator for sampling
         out (Tensor, optional): output Tensor,not supported yet.
         dtype (:class:`oneflow.dtype`, optional): the desired data type of returned tensor.
-            Default: ``oneflow.int32``.
+ Default: ``oneflow.int64``. layout: layout is not supported yet. device: the desired device of returned tensor. Default: cpu. placement:(:class:`flow.placement`, optional): The desired device of returned consistent tensor. If None, @@ -435,29 +427,9 @@ def randperm( Example: - .. code-block:: python >>> import oneflow as flow - - >>> import numpy as np - >>> generator = flow.Generator() - >>> generator.manual_seed(0) - >>> flow.randint(10,(1,10),generator=generator) - tensor([[5, 5, 7, 8, 6, 8, 5, 8, 4, 6]], dtype=oneflow.int64) - """ - assert layout is None, "layout not supported yet" - if type(high) is tuple: - size = high - low, high = 0, low - if len(size) == 0: - size = (1,) - if generator is None: - generator = flow.default_generator() - return Randint( - low, high, size, generator, dtype, layout, device, placement, sbp, requires_grad - )() - >>> generator = flow.Generator() >>> generator.manual_seed(0) >>> flow.randperm(5, generator=generator) @@ -467,12 +439,11 @@ def randperm( assert layout is None, "layout not supported yet" if generator is None: generator = flow.default_generator() - return Randperm( + return RandPerm( n, generator, dtype, layout, device, placement, sbp, requires_grad, pin_memory )(out) - if __name__ == "__main__": import doctest diff --git a/python/oneflow/nn/parallel/ddp.py b/python/oneflow/nn/parallel/ddp.py index 09cd4bc1c36..bec806dabd2 100644 --- a/python/oneflow/nn/parallel/ddp.py +++ b/python/oneflow/nn/parallel/ddp.py @@ -40,9 +40,18 @@ def allreduce(grad): return allreduce -def DistributedDataParallel(module: "flow.nn.Module"): +def DistributedDataParallel( + module: "flow.nn.Module", *, broadcast_buffers: bool = True +): world_size = flow.distributed.get_world_size() - # TODO(jianhao): broadcast parameters and buffers + with flow.no_grad(): + for x in module.parameters(): + requires_grad = x.requires_grad + x.copy_(flow.F.broadcast(x)) + # TODO: fix the bug that x's requires_grad is discarded + # after flow.F.broadcast + x.requires_grad_(requires_grad) + ddp_state_for_reversed_params = OrderedDict( reversed([(x, [False, False]) for x in module.parameters()]) ) @@ -51,7 +60,7 @@ def DistributedDataParallel(module: "flow.nn.Module"): param.register_hook(lambda grad: grad / world_size) param.register_hook(allreduce_fn(ddp_state_for_reversed_params, param)) - def hook(module, input, output): + def post_forward_hook(module, input, output): ddp_state_for_reversed_params = module._ddp_state_for_reversed_params for state in ddp_state_for_reversed_params.values(): state[0], state[1] = False, False @@ -60,5 +69,15 @@ def hook(module, input, output): ) return output - module.register_forward_hook(hook) + module.register_forward_hook(post_forward_hook) + + if broadcast_buffers: + + def pre_forward_hook(module, input): + with flow.no_grad(): + for x in module.buffers(): + x.copy_(flow.F.broadcast(x)) + + module.register_forward_pre_hook(pre_forward_hook) + return module diff --git a/python/oneflow/test/modules/test_adaptive_pool.py b/python/oneflow/test/modules/test_adaptive_pool.py index decdfab86cf..a27f131d5c3 100644 --- a/python/oneflow/test/modules/test_adaptive_pool.py +++ b/python/oneflow/test/modules/test_adaptive_pool.py @@ -13,885 +13,60 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest +from oneflow.nn.common_types import _size_1_t +from packaging import version +import torch as torch_original +from typing import Union, Tuple +from automated_test_util import * -def _test_adaptive_avgpool1d_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool1d(4) - m.to(device) - of_out_1 = m(input) - of_out_2 = flow.adaptive_avg_pool1d(input, 4) - np_out = np.array( - [ - [ - [ - -0.3158535957336426, - -1.1385149955749512, - 0.07601694762706757, - -0.5523459911346436, - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05)) - test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool1d_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool1d(4) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array([[[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]]]) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -@unittest.skipIf( - not flow.unittest.env.eager_execution_enabled(), - ".numpy() doesn't work in lazy mode", -) -def _test_adaptive_avgpool2d_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - 0.10039155930280685, - 0.04879157617688179, - -1.0515470504760742, - 0.9466001987457275, - ], - [ - 0.45375481247901917, - 0.23611211776733398, - 1.343685269355774, - 0.3979687988758087, - ], - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - ], - [ - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool2d((2, 2)) - m.to(device) - of_out_1 = m(input) - of_out_2 = flow.adaptive_avg_pool2d(input, (2, 2)) - np_out = np.array( - [ - [ - [ - [0.20976251363754272, 0.4091767966747284], - [-0.1199183315038681, -0.8454304933547974], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05)) - test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool2d_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - 0.10039155930280685, - 0.04879157617688179, - -1.0515470504760742, - 0.9466001987457275, - ], - [ - 0.45375481247901917, - 0.23611211776733398, - 1.343685269355774, - 0.3979687988758087, - ], - [ - 0.05580734834074974, - -0.6875145435333252, - -1.654430866241455, - -0.6225992441177368, - ], - [ - 0.10183599591255188, - 0.05019790679216385, - -1.2537643909454346, - 0.14907236397266388, - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool2d((2, 2)) - of_out = m(input) - of_out = of_out.sum() - 
of_out.backward() - np_grad = np.array( - [ - [ - [ - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - [0.25, 0.25, 0.25, 0.25], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool2d_hw_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [0.28242185711860657, -0.7742040753364563, -0.5439430475234985], - [-0.1706847995519638, 0.0430854931473732, 0.34247592091560364], - [-1.036131501197815, -1.033642292022705, 0.3455536365509033], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool2d((1, 2)) - m.to(device) - of_out = m(input) - np_out = np.array([[[[-0.4481925666332245, -0.27011242508888245]]]]) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool2d_hw_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [0.28242185711860657, -0.7742040753364563, -0.5439430475234985], - [-0.1706847995519638, 0.0430854931473732, 0.34247592091560364], - [-1.036131501197815, -1.033642292022705, 0.3455536365509033], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool2d((1, 2)) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array( - [ - [ - [ - [0.1666666716337204, 0.3333333432674408, 0.1666666716337204], - [0.1666666716337204, 0.3333333432674408, 0.1666666716337204], - [0.1666666716337204, 0.3333333432674408, 0.1666666716337204], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - -1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool3d((2, 2, 2)) - 
m.to(device) - of_out_1 = m(input) - of_out_2 = flow.adaptive_avg_pool3d(input, (2, 2, 2)) - np_out = np.array( - [ - [ - [ - [ - [-0.3192335125472539, 0.2159474151198386], - [-0.5121654212876662, -0.3655204892948264], - ], - [ - [0.4966693377547728, -0.2015024299324123], - [-0.11470347800925032, 0.18131719803880864], - ], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out_1.numpy(), np_out, 1e-05, 1e-05)) - test_case.assertTrue(np.allclose(of_out_2.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - -1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool3d((2, 2, 2)) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array( - [ - [ - [ - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - [ - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - [0.125, 0.125, 0.125, 0.125], - ], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_dhw_forward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - 
-1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - ) - m = flow.nn.AdaptiveAvgPool3d((1, 2, 3)) - m.to(device) - of_out = m(input) - np_out = np.array( - [ - [ - [ - [0.08871791260375947, -0.4024959376509308, 0.00722249259371315], - [-0.31343444964845824, 0.08188803218941582, -0.09210164562800888], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_adaptive_avgpool3d_dhw_backward(test_case, device): - input = flow.Tensor( - np.array( - [ - [ - [ - [ - [ - -1.077571799600885, - -0.7804538890365837, - -1.2627538752119443, - 0.9993507145120477, - ], - [ - 2.0222532489157516, - 1.103451377699465, - -0.4377324754879578, - 1.890491810587517, - ], - [ - -0.5593861899064654, - -0.4949520241526519, - -0.18536721363519787, - -0.6098969866775772, - ], - [ - -1.6536215260171816, - -1.0392583540436786, - 0.3686776597613967, - -0.5356882834951805, - ], - ], - [ - [ - -1.2617900664449953, - -1.4390921091631532, - 0.20654399652431357, - 0.8186472101906713, - ], - [ - -0.3033378863400014, - -0.8173269764076293, - -0.3767515097625614, - -0.11021655039337777, - ], - [ - -0.22977043608192885, - 1.2717196366649905, - -0.4790851297878291, - -1.4495369404727856, - ], - [ - -1.2802093286977783, - -0.11184514806663474, - 1.7022167087210984, - -1.7354837287725355, - ], - ], - [ - [ - 2.4706497991773606, - -0.6549702631973298, - -0.9318107079571676, - 1.4652904271682428, - ], - [ - 1.1419864234341397, - 1.389909081086008, - 0.9657841900525568, - -0.8563114264976619, - ], - [ - 0.19515087084250754, - -0.37808457398571094, - 0.2938625398496183, - 0.9279930510353327, - ], - [ - -0.9374118277994007, - 0.3341831730452431, - -0.2792542765303833, - 0.38029090707066726, - ], - ], - [ - [ - 0.5918686659736041, - -0.7870631089938902, - -0.9534344874245392, - 0.31341612954718795, - ], - [ - 0.7509029444145228, - -0.9299288398562323, - -0.7343054052782476, - -0.8806481590696694, - ], - [ - -0.4707853016353985, - 0.12253641652645629, - 0.5088022039832846, - 0.520391789327562, - ], - [ - -0.0861300651163632, - 0.30291348404866386, - -0.6268565873680123, - -0.27469204305759976, - ], - ], - ] - ] - ] - ), - dtype=flow.float32, - device=flow.device(device), - 
requires_grad=True, - ) - m = flow.nn.AdaptiveAvgPool3d((1, 2, 3)) - of_out = m(input) - of_out = of_out.sum() - of_out.backward() - np_grad = np.array( - [ - [ - [ - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - [ - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - [0.0625, 0.125, 0.125, 0.0625], - ], - ] - ] - ] - ) - test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-05, 1e-05)) +NoneType = type(None) +# Not the same as those in PyTorch because 'output_size' cannot be NoneType (even in 'torch.nn.AdaptiveAvgPoolXd') +_size_2_opt_t_not_none = Union[int, Tuple[Union[int, NoneType], Union[int, NoneType]]] +_size_3_opt_t_not_none = Union[ + int, Tuple[Union[int, NoneType], Union[int, NoneType], Union[int, NoneType]] +] @flow.unittest.skip_unless_1n1d() class TestAdaptiveAvgPool(flow.unittest.TestCase): + @autotest() def test_adaptive_avgpool1d(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_adaptive_avgpool1d_forward, - _test_adaptive_avgpool1d_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - + m = torch.nn.AdaptiveAvgPool1d(output_size=random().to(_size_1_t)) + m.train(random()) + device = random_device() + m.to(device) + x = random_pytorch_tensor(ndim=3).to(device) + y = m(x) + return y + + @autotest() def test_adaptive_avgpool2d(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_adaptive_avgpool2d_forward, - _test_adaptive_avgpool2d_backward, - _test_adaptive_avgpool2d_hw_forward, - _test_adaptive_avgpool2d_hw_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - + m = torch.nn.AdaptiveAvgPool2d(output_size=random().to(_size_2_opt_t_not_none)) + m.train(random()) + device = random_device() + m.to(device) + x = random_pytorch_tensor(ndim=4).to(device) + y = m(x) + return y + + @unittest.skipIf( + version.parse(torch_original.__version__) < version.parse("1.10.0"), + "GPU version 'nn.AdaptiveAvgPool3d' has a bug in PyTorch before '1.10.0'", + ) + @autotest() def test_adaptive_avgpool3d(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_adaptive_avgpool3d_forward, - _test_adaptive_avgpool3d_backward, - _test_adaptive_avgpool3d_dhw_forward, - _test_adaptive_avgpool3d_dhw_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) + m = torch.nn.AdaptiveAvgPool3d(output_size=random().to(_size_3_opt_t_not_none)) + m.train(random()) + device = random_device() + m.to(device) + x = random_pytorch_tensor(ndim=5).to(device) + y = m(x) + return y if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_autograd_mode.py b/python/oneflow/test/modules/test_autograd_mode.py new file mode 100644 index 00000000000..76ba59f8c14 --- /dev/null +++ b/python/oneflow/test/modules/test_autograd_mode.py @@ -0,0 +1,76 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import unittest +import oneflow as flow + +import oneflow.unittest + + +class TestAutogradMode(oneflow.unittest.TestCase): + def test_grad_mode(test_case): + test_case.assertTrue(flow.is_grad_enabled()) + + def test_inference_mode(test_case): + with flow.inference_mode(True): + test_case.assertFalse(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.inference_mode(True) + def func(): + test_case.assertFalse(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + with flow.inference_mode(False): + test_case.assertTrue(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.inference_mode(False) + def func(): + test_case.assertTrue(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + def test_grad_enable(test_case): + with flow.grad_enable(): + test_case.assertTrue(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.grad_enable() + def func(): + test_case.assertTrue(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + def test_no_grad(test_case): + with flow.no_grad(): + test_case.assertFalse(flow.is_grad_enabled()) + test_case.assertTrue(flow.is_grad_enabled()) + + @flow.no_grad() + def func(): + test_case.assertFalse(flow.is_grad_enabled()) + + func() + test_case.assertTrue(flow.is_grad_enabled()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_avgpool.py b/python/oneflow/test/modules/test_avgpool.py index 61f7b0eb023..17eab8dcdb6 100644 --- a/python/oneflow/test/modules/test_avgpool.py +++ b/python/oneflow/test/modules/test_avgpool.py @@ -13,13 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - -import math import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest @@ -28,7 +22,7 @@ @flow.unittest.skip_unless_1n1d() class TestAvgPoolingModule(flow.unittest.TestCase): - @autotest(n=20) + @autotest(n=100) def test_avgpool1d_with_random_data(test_case): m = torch.nn.AvgPool1d( kernel_size=random(4, 6), @@ -44,7 +38,7 @@ def test_avgpool1d_with_random_data(test_case): y = m(x) return y - @autotest(n=20) + @autotest(n=100) def test_avgpool2d_with_random_data(test_case): m = torch.nn.AvgPool2d( kernel_size=random(4, 6), @@ -63,17 +57,16 @@ def test_avgpool2d_with_random_data(test_case): y = m(x) return y - @autotest(n=20) + @autotest(n=100) def test_avgpool3d_with_random_data(test_case): m = torch.nn.AvgPool3d( - kernel_size=constant(3), - stride=constant(1), - padding=constant(1), + kernel_size=random(4, 6), + stride=random(1, 3) | nothing(), + padding=random(1, 3) | nothing(), ceil_mode=random(), count_include_pad=random(), divisor_override=random().to(int), ) - m.train(random()) device = random_device() m.to(device) diff --git a/python/oneflow/test/modules/test_consistent_cast.py b/python/oneflow/test/modules/test_consistent_cast.py index 080e93b783a..ff07ab3e360 100644 --- a/python/oneflow/test/modules/test_consistent_cast.py +++ b/python/oneflow/test/modules/test_consistent_cast.py @@ -73,6 +73,21 @@ def test_local_to_consistent_broadcast_data(test_case): np.array_equal(z.numpy(), np.ones((16, 16), dtype=np.int32)) ) + def test_cuda_consistent_to_consistent_cpu_s2b(test_case): + x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32) + placement = flow.placement("cpu", {0: range(2)}) + y = x.to_consistent(placement=placement, sbp=flow.sbp.split(0)) + sbp = (flow.sbp.broadcast,) + y = y.to_consistent(sbp=sbp) + test_case.assertEqual(y.sbp, sbp) + test_case.assertEqual(y.placement, placement) + test_case.assertEqual(tuple(y.shape), (32, 16)) + test_case.assertEqual(y.dtype, flow.int32) + z = y.to_local() + test_case.assertTrue( + np.array_equal(z.numpy(), np.ones((32, 16), dtype=np.int32)) + ) + def test_cuda_consistent_to_consistent_s2b(test_case): x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32) placement = flow.placement("cuda", {0: range(2)}) @@ -88,6 +103,26 @@ def test_cuda_consistent_to_consistent_s2b(test_case): np.array_equal(z.numpy(), np.ones((32, 16), dtype=np.int32)) ) + def test_cuda_consistent_to_consistent_cpu_s2p(test_case): + x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32) + placement = flow.placement("cpu", {0: range(2)}) + y = x.to_consistent(placement=placement, sbp=flow.sbp.split(0)) + sbp = (flow.sbp.partial_sum,) + y = y.to_consistent(sbp=sbp) + test_case.assertEqual(y.sbp, sbp) + test_case.assertEqual(y.placement, placement) + test_case.assertEqual(tuple(y.shape), (32, 16)) + test_case.assertEqual(y.dtype, flow.int32) + z = y.to_local() + if int(os.getenv("RANK")) == 0: + test_case.assertTrue( + np.array_equal(z.numpy(), np.ones((32, 16), dtype=np.int32)) + ) + else: + test_case.assertTrue( + np.array_equal(z.numpy(), np.zeros((32, 16), dtype=np.int32)) + ) + def test_cuda_consistent_to_consistent_s2p(test_case): x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32) placement = flow.placement("cuda", {0: range(2)}) @@ -143,6 +178,21 @@ def test_cuda_consistent_to_consistent_b2s(test_case): np.array_equal(z.numpy(), np.ones((8, 16), dtype=np.int32)) ) + def 
test_cuda_consistent_to_consistent_cpu_p2s(test_case):
+        x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32)
+        placement = flow.placement("cpu", {0: range(2)})
+        y = x.to_consistent(placement=placement, sbp=flow.sbp.partial_sum)
+        sbp = (flow.sbp.split(0),)
+        y = y.to_consistent(sbp=sbp)
+        test_case.assertEqual(y.sbp, sbp)
+        test_case.assertEqual(y.placement, placement)
+        test_case.assertEqual(tuple(y.shape), (16, 16))
+        test_case.assertEqual(y.dtype, flow.int32)
+        z = y.to_local()
+        test_case.assertTrue(
+            np.array_equal(z.numpy(), np.ones((8, 16), dtype=np.int32) * 2)
+        )
+
     def test_cuda_consistent_to_consistent_p2s(test_case):
         x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32)
         placement = flow.placement("cuda", {0: range(2)})
@@ -158,6 +208,41 @@ def test_cuda_consistent_to_consistent_p2s(test_case):
             np.array_equal(z.numpy(), np.ones((8, 16), dtype=np.int32) * 2)
         )
 
+    def test_cuda_consistent_to_consistent_cuda_h2d(test_case):
+        x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32)
+        placement = flow.placement("cpu", {0: range(2)})
+        cuda_placement = flow.placement("cuda", {0: range(2)})
+        y = x.to_consistent(placement=placement, sbp=flow.sbp.partial_sum)
+        y = y.to_consistent(placement=cuda_placement, sbp=flow.sbp.partial_sum)
+        test_case.assertEqual(y.sbp, (flow.sbp.partial_sum,))
+        test_case.assertEqual(y.placement, cuda_placement)
+        test_case.assertEqual(tuple(y.shape), (16, 16))
+        test_case.assertEqual(y.dtype, flow.int32)
+        z = y.to_local()
+        test_case.assertTrue(
+            np.array_equal(z.numpy(), np.ones((16, 16), dtype=np.int32))
+        )
+
+    def test_cuda_consistent_to_consistent_cpu_p2b(test_case):
+        x = flow.ones((16, 16), device=flow.device("cpu"), dtype=flow.int32)
+        placement = flow.placement("cpu", {0: range(2)})
+        cuda_placement = flow.placement("cuda", {0: range(2)})
+        y = x.to_consistent(placement=placement, sbp=flow.sbp.partial_sum)
+        y = y.to_consistent(placement=cuda_placement, sbp=flow.sbp.partial_sum)
+        sbp = (flow.sbp.broadcast,)
+        y = y.to_consistent(placement=cuda_placement, sbp=sbp)
+        y = y.to_consistent(placement=placement, sbp=sbp)
+        test_case.assertEqual(y.sbp, sbp)
+        test_case.assertEqual(y.placement, placement)
+        test_case.assertEqual(tuple(y.shape), (16, 16))
+        test_case.assertEqual(y.dtype, flow.int32)
+        z = y.to_local()
+        test_case.assertTrue(
+            np.array_equal(z.numpy(), np.ones((16, 16), dtype=np.int32) * 2)
+        )
+
     def test_cuda_consistent_to_consistent_p2b(test_case):
         x = flow.ones((16, 16), device=flow.device("cuda"), dtype=flow.int32)
         placement = flow.placement("cuda", {0: range(2)})
diff --git a/python/oneflow/test/modules/test_ddp.py b/python/oneflow/test/modules/test_ddp.py
index 06ab7e25f6f..3bcc3c90334 100644
--- a/python/oneflow/test/modules/test_ddp.py
+++ b/python/oneflow/test/modules/test_ddp.py
@@ -129,6 +129,49 @@ def forward(self, x):
         test_case.assertTrue(np_allclose_with_shape(m.w2.grad.numpy(), np.array([4.5])))
         test_case.assertTrue(np_allclose_with_shape(m.w3.grad.numpy(), np.array([3])))
 
+    def test_broadcast_buffer(test_case):
+        rank = flow.framework.distribute.get_rank()
+
+        class CustomModule(flow.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer("buf", flow.tensor([1, 2]) * (rank + 1))
+
+            def forward(self, x):
+                res = self.buf + x
+                self.buf.copy_(x)
+                return res
+
+        x = flow.tensor([2, 3]) * (rank + 1)
+        x = x.to("cuda")
+
+        m = CustomModule()
+        m = m.to("cuda")
+        m = ddp(m)
+
+        y1 = m(x)
+        y2 = m(x)
+
+        m = CustomModule()
+        m = m.to("cuda")
+        m =
ddp(m, broadcast_buffers=False) + + y3 = m(x) + y4 = m(x) + + if rank == 0: + test_case.assertTrue(np_allclose_with_shape(y1.numpy(), np.array([3, 5]))) + test_case.assertTrue(np_allclose_with_shape(y2.numpy(), np.array([4, 6]))) + test_case.assertTrue(np_allclose_with_shape(y3.numpy(), np.array([3, 5]))) + test_case.assertTrue(np_allclose_with_shape(y4.numpy(), np.array([4, 6]))) + elif rank == 1: + test_case.assertTrue(np_allclose_with_shape(y1.numpy(), np.array([5, 8]))) + test_case.assertTrue(np_allclose_with_shape(y2.numpy(), np.array([6, 9]))) + test_case.assertTrue(np_allclose_with_shape(y3.numpy(), np.array([6, 10]))) + test_case.assertTrue(np_allclose_with_shape(y4.numpy(), np.array([8, 12]))) + else: + raise ValueError() + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_matmul.py b/python/oneflow/test/modules/test_matmul.py index 8b0fb5a471d..b10d90d1274 100644 --- a/python/oneflow/test/modules/test_matmul.py +++ b/python/oneflow/test/modules/test_matmul.py @@ -13,330 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. """ - import unittest -from collections import OrderedDict - -import numpy as np -import torch -from automated_test_util import * -from test_util import GenArgList import oneflow as flow import oneflow.unittest - - -def _test_matmul(test_case, device): - input1 = flow.Tensor( - np.random.randn(2, 6), dtype=flow.float32, device=flow.device(device) - ) - input2 = flow.Tensor( - np.random.randn(6, 5), dtype=flow.float32, device=flow.device(device) - ) - of_out = flow.matmul(input1, input2) - np_out = np.matmul(input1.numpy(), input2.numpy()) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_broadcast_matmul(test_case, device): - input1 = flow.Tensor( - np.random.randn(3, 4, 5), dtype=flow.float32, device=flow.device(device) - ) - input2 = flow.Tensor( - np.random.randn(5, 6), dtype=flow.float32, device=flow.device(device) - ) - of_out = flow.matmul(input1, input2) - np_out = np.matmul(input1.numpy(), input2.numpy()) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_batch_matmul(test_case, device): - input1 = flow.Tensor( - np.random.randn(10, 3, 4), dtype=flow.float32, device=flow.device(device) - ) - input2 = flow.Tensor( - np.random.randn(10, 4, 5), dtype=flow.float32, device=flow.device(device) - ) - of_out = flow.matmul(input1, input2) - np_out = np.matmul(input1.numpy(), input2.numpy()) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05)) - - -def _test_matmul_backward(test_case, device): - input1 = flow.Tensor( - [ - [ - -0.36023932695388794, - 0.5571867227554321, - -1.4987696409225464, - -0.9674592018127441, - 0.021076146513223648, - 2.9180469512939453, - ], - [ - -0.29169487953186035, - 0.2978641390800476, - 0.8198832273483276, - -0.3385652005672455, - -2.9260432720184326, - 0.22528153657913208, - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [ - -0.5270200371742249, - -0.4325239062309265, - -0.33396217226982117, - 1.2983192205429077, - -0.463693231344223, - ], - [ - 1.893467903137207, - -1.0874812602996826, - 0.7068315744400024, - -0.23532593250274658, - -0.011510828509926796, - ], - [ - -0.5477776527404785, - -0.0381619855761528, - 0.03451986983418465, - -0.8248650431632996, - -1.8885509967803955, - ], - [ - -1.0034432411193848, - 0.5428839921951294, - -0.7785694599151611, - -0.4489346146583557, 
- 1.780846118927002, - ], - [ - 0.9378347396850586, - -0.38816362619400024, - 0.8186876177787781, - -0.9630932807922363, - -0.11487948149442673, - ], - [ - -0.12073716521263123, - 2.181835174560547, - 0.5511962175369263, - -1.294308066368103, - -0.7765272855758667, - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - -0.45888009667396545, - 1.2659813165664673, - -3.264835834503174, - 0.09278273582458496, - 0.2903860807418823, - 0.5414588451385498, - ], - [ - -0.45888009667396545, - 1.2659813165664673, - -3.264835834503174, - 0.09278273582458496, - 0.2903860807418823, - 0.5414588451385498, - ], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - -def _test_matmul_backward_x_grad(test_case, device): - input1 = flow.Tensor( - [ - [-1.8604081869125366, -2.0019688606262207], - [1.0511547327041626, -2.263841390609741], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [-0.13973912596702576, 0.8478717803955078], - [-0.2144828885793686, -1.7145386934280396], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=False, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [0.7081326246261597, -1.9290215969085693], - [0.7081326246261597, -1.9290215969085693], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - -def _test_matmul_backward_y_grad(test_case, device): - input1 = flow.Tensor( - [ - [-1.8604081869125366, -2.0019688606262207], - [1.0511547327041626, -2.263841390609741], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=False, - ) - input2 = flow.Tensor( - [ - [-0.13973912596702576, 0.8478717803955078], - [-0.2144828885793686, -1.7145386934280396], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - print(input2.grad.numpy().tolist()) - np_grad = [ - [-0.809253454208374, -0.809253454208374], - [-4.265810012817383, -4.265810012817383], - ] - test_case.assertTrue( - np.allclose(input2.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - -def _test_broadcast_matmul_backward(test_case, device): - input1 = flow.Tensor( - [ - [ - [0.5893293023109436, -0.0376124233007431, 0.7791574001312256], - [1.1614371538162231, 0.009700910188257694, 0.7281601428985596], - ], - [ - [-0.27213698625564575, 0.7058051824569702, -0.4643424451351166], - [2.2279646396636963, 0.05870082601904869, -0.18335142731666565], - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [0.25825661420822144, -0.4875393807888031], - [-0.040459781885147095, -0.3713535666465759], - [-1.633512258529663, -2.0034799575805664], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - ], - [ - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - [-0.22928276658058167, -0.411813348531723, -3.6369922161102295], - ], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, 
rtol=1e-05) - ) - - -def _test_batch_matmul_backward(test_case, device): - input1 = flow.Tensor( - [ - [ - [-0.0036776792258024216, 1.9946473836898804, -0.423959881067276], - [1.0892143249511719, 0.04005361348390579, -0.27883127331733704], - ], - [ - [-0.970306396484375, 0.017771577462553978, 0.019596196711063385], - [0.27402883768081665, -0.8192587494850159, -0.3135920464992523], - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - input2 = flow.Tensor( - [ - [ - [1.118346929550171, -0.930071234703064], - [1.1238232851028442, 1.373764157295227], - [0.17178462445735931, -1.1010534763336182], - ], - [ - [0.6694859862327576, 0.9250285029411316], - [-1.0835869312286377, 0.4192655086517334], - [1.2616937160491943, 0.33809131383895874], - ], - ], - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = flow.matmul(input1, input2) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [0.18827569484710693, 2.4975874423980713, -0.9292688369750977], - [0.18827569484710693, 2.4975874423980713, -0.9292688369750977], - ], - [ - [1.5945144891738892, -0.6643214225769043, 1.5997850894927979], - [1.5945144891738892, -0.6643214225769043, 1.5997850894927979], - ], - ] - test_case.assertTrue( - np.allclose(input1.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) +from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestModule(flow.unittest.TestCase): - def test_matmul(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_matmul, - _test_broadcast_matmul, - _test_batch_matmul, - _test_matmul_backward, - _test_matmul_backward_x_grad, - _test_matmul_backward_y_grad, - _test_batch_matmul_backward, - _test_broadcast_matmul_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - @autotest() def test_flow_matmul_with_random_data(test_case): k = random(1, 6) @@ -345,6 +30,13 @@ def test_flow_matmul_with_random_data(test_case): z = torch.matmul(x, y) return z + @autotest() + def test_flow_tensor_matmul_with_random_data(test_case): + k = random(1, 6) + x = random_pytorch_tensor(ndim=2, dim1=k) + y = random_pytorch_tensor(ndim=2, dim0=k) + return x.matmul(y) + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_negative.py b/python/oneflow/test/modules/test_negative.py index 4534545a224..7352a29cb72 100644 --- a/python/oneflow/test/modules/test_negative.py +++ b/python/oneflow/test/modules/test_negative.py @@ -15,69 +15,14 @@ """ import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest from automated_test_util import * -def _test_negtive(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device)) - of_out = flow.negative(input) - np_out = -input.numpy() - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_negative_neg(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device)) - of_out = flow.neg(input) - np_out = -input.numpy() - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tensor_negative(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor(np_input, dtype=flow.float32, device=flow.device(device)) - of_out = input.negative() - np_out = -input.numpy() 
- test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_negative_backward(test_case, shape, device): - np_input = np.random.randn(*shape) - input = flow.Tensor( - np_input, dtype=flow.float32, device=flow.device(device), requires_grad=True - ) - of_out = flow.negative(input) - of_out = of_out.sum() - of_out.backward() - np_grad = -np.ones(shape) - test_case.assertTrue( - np.allclose(input.grad.numpy(), np_grad, atol=1e-05, rtol=1e-05) - ) - - @flow.unittest.skip_unless_1n1d() class TestNegativeModule(flow.unittest.TestCase): - def test_negative(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_negtive, - _test_negative_neg, - _test_tensor_negative, - _test_negative_backward, - ] - arg_dict["shape"] = [(2, 3), (2, 4, 5, 6)] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - @autotest(auto_backward=False) def test_ne_with_0shape_data(test_case): device = random_device() @@ -87,6 +32,23 @@ def test_ne_with_0shape_data(test_case): y3 = -x return (y1, y2, y3) + @autotest() + def test_tensor_negative_with_random_data(test_case): + x = random_pytorch_tensor().to(random_device()) + return x.negative() + + @autotest() + def test_negative_with_random_data(test_case): + x = random_pytorch_tensor().to(random_device()) + z = torch.negative(x) + return z + + @autotest() + def test_neg_with_random_data(test_case): + x = random_pytorch_tensor().to(random_device()) + z = torch.neg(x) + return z + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_pooling.py b/python/oneflow/test/modules/test_pooling.py index e5e36c93863..4953d5477b5 100644 --- a/python/oneflow/test/modules/test_pooling.py +++ b/python/oneflow/test/modules/test_pooling.py @@ -28,14 +28,16 @@ def unpack_indices(dual_object): @flow.unittest.skip_unless_1n1d() class TestMaxPooling(flow.unittest.TestCase): - @autotest(n=20, auto_backward=True, rtol=1e-4, atol=1e-4) + @autotest(n=100, auto_backward=False) def test_maxpool1d_with_random_data(test_case): + return_indices = random().to(bool).value() m = torch.nn.MaxPool1d( kernel_size=random(4, 6).to(_size_1_t), stride=random(1, 3).to(_size_1_t) | nothing(), padding=random(1, 3).to(_size_1_t) | nothing(), dilation=random(2, 4).to(_size_1_t) | nothing(), - ceil_mode=random() | nothing(), + ceil_mode=random(), + return_indices=return_indices, ) m.train(random()) device = random_device() @@ -43,16 +45,21 @@ def test_maxpool1d_with_random_data(test_case): x = random_pytorch_tensor(ndim=3, dim2=random(20, 22)).to(device) y = m(x) - return y + if return_indices: + return unpack_indices(y) + else: + return y, y.sum().backward() - @autotest(n=20, auto_backward=True, rtol=1e-4, atol=1e-4) + @autotest(n=100, auto_backward=False) def test_maxpool2d_with_random_data(test_case): + return_indices = random().to(bool).value() m = torch.nn.MaxPool2d( kernel_size=random(4, 6).to(_size_2_t), stride=random(1, 3).to(_size_2_t) | nothing(), padding=random(1, 3).to(_size_2_t) | nothing(), dilation=random(2, 4).to(_size_2_t) | nothing(), - ceil_mode=random() | nothing(), + ceil_mode=random(), + return_indices=return_indices, ) m.train(random()) device = random_device() @@ -62,16 +69,21 @@ def test_maxpool2d_with_random_data(test_case): ) y = m(x) - return y + if return_indices: + return unpack_indices(y) + else: + return y, y.sum().backward() - @autotest(n=20, auto_backward=True, rtol=1e-4, atol=1e-4) + @autotest(n=100, auto_backward=False) def 
test_maxpool3d_with_random_data(test_case): + return_indices = random().to(bool).value() m = torch.nn.MaxPool3d( kernel_size=random(4, 6).to(_size_3_t), stride=random(1, 3).to(_size_3_t) | nothing(), padding=random(1, 3).to(_size_3_t) | nothing(), dilation=random(2, 4).to(_size_3_t) | nothing(), - ceil_mode=random() | nothing(), + ceil_mode=random(), + return_indices=return_indices, ) m.train(random()) device = random_device() @@ -81,7 +93,10 @@ def test_maxpool3d_with_random_data(test_case): ).to(device) y = m(x) - return y + if return_indices: + return unpack_indices(y) + else: + return y, y.sum().backward() if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_randint.py b/python/oneflow/test/modules/test_randint.py index 7a668417dab..122cd9ad097 100644 --- a/python/oneflow/test/modules/test_randint.py +++ b/python/oneflow/test/modules/test_randint.py @@ -22,7 +22,6 @@ import oneflow.unittest from test_util import GenArgList -from automated_test_util import * def _test_rand(test_case, device, shape, low, high): @@ -80,16 +79,14 @@ def _test_high(test_case, device, shape, low, high): def _test_0rank(test_case, device, shape, low, high): y1 = flow.randint(low, high, shape, device=flow.device(device)) - y2 = flow.randint(low, high, shape, device=flow.device(device)) - test_case.assertTrue(not np.allclose(y1.numpy(), y2.numpy(), atol=1e-4, rtol=1e-4)) - + test_case.assertTrue(y1.shape == shape) @flow.unittest.skip_unless_1n1d() class TestRandint(flow.unittest.TestCase): def test_consistent_naive(test_case): placement = flow.placement("cpu", {0: [0]}) sbp = (flow.sbp.broadcast,) - x = flow.randint(16, (10, 1), placement=placement, sbp=sbp) + x = flow.randint(0, 16, (10, 1), placement=placement, sbp=sbp) test_case.assertEqual(x.sbp, sbp) test_case.assertEqual(x.placement, placement) diff --git a/python/oneflow/test/modules/test_repeat.py b/python/oneflow/test/modules/test_repeat.py index 1346b4444ca..042c58c9815 100644 --- a/python/oneflow/test/modules/test_repeat.py +++ b/python/oneflow/test/modules/test_repeat.py @@ -13,177 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest - - -def np_repeat(x, sizes): - return np.tile(x, sizes) - - -def _test_repeat_new_dim(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 3, 2, 3, 3) - np_out = np_repeat(input.numpy(), sizes) - of_out = input.repeat(4, 3, 2, 3, 3) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_repeat_input_list_new_dim(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 3, 2, 3, 3) - np_out = np_repeat(input.numpy(), sizes) - of_out = input.repeat(sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_repeat_same_dim(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 2, 3, 19) - of_out = input.repeat(4, 2, 3, 19) - np_out = np_repeat(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_repeat_same_dim_int(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int32, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.repeat(size_tensor.size()) - np_out = np_repeat(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_repeat_same_dim_int8(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int8, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.repeat(sizes) - np_out = np_repeat(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_repeat_new_dim_backward(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (4, 3, 2, 3, 3) - of_out = input.repeat(4, 3, 2, 3, 3) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - [ - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - ] - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_repeat_same_dim_backward(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - of_out = input.repeat(1, 2, 3, 1) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [ - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - ], - [ - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - [6.0, 6.0, 6.0], - ], - ] - ] - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_repeat_flow_size(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = flow.Size([4, 3, 2, 3, 3]) - of_out = input.repeat(sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = [ - [ - [[216.0, 216.0, 216.0]], - [[216.0, 
216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - [ - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - [[216.0, 216.0, 216.0]], - ], - ] - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) +from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestRepeat(flow.unittest.TestCase): - def test_repeat(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_repeat_new_dim, - _test_repeat_same_dim, - _test_repeat_same_dim_int, - _test_repeat_same_dim_int8, - _test_repeat_new_dim_backward, - _test_repeat_same_dim_backward, - _test_repeat_flow_size, - _test_repeat_input_list_new_dim, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) + @autotest() + def test_flow_tensor_repeat_with_random_data(test_case): + x = random_pytorch_tensor(ndim=2, dim0=1, dim1=2) + sizes = (random(1, 5).to(int), random(1, 5).to(int), random(1, 5).to(int)) + y = x.repeat(sizes) + return y if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_tensor_str.py b/python/oneflow/test/modules/test_tensor_str.py index 5c88acb2f2a..ad409a1f228 100644 --- a/python/oneflow/test/modules/test_tensor_str.py +++ b/python/oneflow/test/modules/test_tensor_str.py @@ -14,6 +14,7 @@ limitations under the License. """ +import os import unittest from collections import OrderedDict @@ -26,7 +27,6 @@ import oneflow -@flow.unittest.skip_unless_1n1d() def _test_local_tensor_str(test_case, device): # int dtype x = flow.tensor([[1, 2, 3], [4, 5, -6]], device=flow.device(device)) @@ -92,7 +92,6 @@ def _test_local_tensor_str(test_case, device): test_case.assertTrue("..." in tensor_str) -@flow.unittest.skip_unless_1n1d() def _test_consistent_tensor_str(test_case, device): placement = flow.placement(device, {0: range(1)}) # split consistent tensor @@ -109,7 +108,6 @@ def _test_consistent_tensor_str(test_case, device): x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.partial_sum]) tensor_str = str(x) test_case.assertTrue("1." in tensor_str) - test_case.assertTrue("1." in str(x[0][0])) # summarized consistent tensor x = flow.ones((100, 100), placement=placement, sbp=[flow.sbp.split(0)]) @@ -123,7 +121,6 @@ def _test_consistent_tensor_str(test_case, device): test_case.assertTrue("[]" in tensor_str) -@flow.unittest.skip_unless_1n2d() def _test_consistent_tensor_str_2d(test_case, device): placement = flow.placement(device, {0: range(2)}) x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.split(0)]) @@ -133,6 +130,8 @@ def _test_consistent_tensor_str_2d(test_case, device): x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.broadcast]) tensor_str = str(x) test_case.assertTrue("1." in tensor_str) + # TODO: x[0][0].to("cuda") has bug + # test_case.assertTrue("1." 
in str(x[0][0])) x = flow.ones((10, 10), placement=placement, sbp=[flow.sbp.partial_sum]) tensor_str = str(x) @@ -145,14 +144,35 @@ def _test_consistent_tensor_str_2d(test_case, device): class TestTensorStrModule(flow.unittest.TestCase): - def test_tensor_str(test_case): + @flow.unittest.skip_unless_1n1d() + def test_local_tensor_str_1n1d(test_case): arg_dict = OrderedDict() arg_dict["test_fun"] = [ _test_local_tensor_str, + ] + arg_dict["device"] = ["cpu", "cuda"] + for arg in GenArgList(arg_dict): + arg[0](test_case, *arg[1:]) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + @flow.unittest.skip_unless_1n1d() + def test_consistent_tensor_str_1n1d(test_case): + arg_dict = OrderedDict() + arg_dict["test_fun"] = [ _test_consistent_tensor_str, + ] + arg_dict["device"] = ["cuda"] + for arg in GenArgList(arg_dict): + arg[0](test_case, *arg[1:]) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + @flow.unittest.skip_unless_1n2d() + def test_tensor_str_1n2d(test_case): + arg_dict = OrderedDict() + arg_dict["test_fun"] = [ _test_consistent_tensor_str_2d, ] - arg_dict["device"] = ["cpu", "cuda"] + arg_dict["device"] = ["cuda"] for arg in GenArgList(arg_dict): arg[0](test_case, *arg[1:]) diff --git a/python/oneflow/test/modules/test_tile.py b/python/oneflow/test/modules/test_tile.py index ae9a94ed84e..e2985d8d426 100644 --- a/python/oneflow/test/modules/test_tile.py +++ b/python/oneflow/test/modules/test_tile.py @@ -13,167 +13,28 @@ See the License for the specific language governing permissions and limitations under the License. """ - import unittest -from collections import OrderedDict - -import numpy as np -from test_util import GenArgList import oneflow as flow import oneflow.unittest - - -def np_tile(x, sizes): - return np.tile(x, sizes) - - -def np_tile_grad(x, sizes): - times = np.array(sizes).prod() - return np.ones(shape=x.shape) * times - - -def _test_tile_less_dim_a(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (2,) - np_out = np_tile(input.numpy(), sizes) - of_out = input.tile(reps=sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_less_dim_b(test_case, device): - input = flow.Tensor( - np.random.randn(3, 2, 5), dtype=flow.float32, device=flow.device(device) - ) - sizes = (3, 4) - np_out = np_tile(input.numpy(), sizes) - of_out = input.tile(reps=sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_less_dim_c(test_case, device): - input = flow.Tensor( - np.random.randn(4, 3, 2, 5, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (2, 3, 4, 4) - np_out = np_tile(input.numpy(), sizes) - of_out = input.tile(reps=sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_same_dim(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.float32, device=flow.device(device) - ) - sizes = (4, 2, 3, 19) - of_out = input.tile(reps=sizes) - np_out = np_tile(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out)) - - -def _test_tile_same_dim_int(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int32, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.tile(reps=sizes) - np_out = np_tile(input.numpy(), sizes) - 
test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_tile_same_dim_int8(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), dtype=flow.int8, device=flow.device(device) - ) - size_tensor = flow.Tensor(np.random.randn(4, 2, 3, 19)) - sizes = size_tensor.size() - of_out = input.tile(reps=sizes) - np_out = np_tile(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(of_out.numpy(), np_out.astype(np.int32))) - - -def _test_tile_less_dim_a_backward(test_case, device): - input = flow.Tensor( - np.random.randn(2, 4, 1, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (2,) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_tile_less_dim_b_backward(test_case, device): - input = flow.Tensor( - np.random.randn(3, 2, 5), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (3, 4) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_tile_less_dim_c_backward(test_case, device): - input = flow.Tensor( - np.random.randn(4, 3, 2, 5, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (2, 3, 4, 4) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) - - -def _test_tile_same_dim_backward(test_case, device): - input = flow.Tensor( - np.random.randn(1, 2, 5, 3), - dtype=flow.float32, - device=flow.device(device), - requires_grad=True, - ) - sizes = (1, 2, 3, 1) - of_out = input.tile(reps=sizes) - of_out = of_out.sum() - of_out.backward() - np_grad = np_tile_grad(input.numpy(), sizes) - test_case.assertTrue(np.array_equal(input.grad.numpy(), np_grad)) +from automated_test_util import * @flow.unittest.skip_unless_1n1d() class TestTile(flow.unittest.TestCase): - def test_tile(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_tile_less_dim_a, - _test_tile_less_dim_b, - _test_tile_less_dim_c, - _test_tile_same_dim, - _test_tile_same_dim_int, - _test_tile_same_dim_int8, - _test_tile_less_dim_a_backward, - _test_tile_less_dim_b_backward, - _test_tile_less_dim_c_backward, - _test_tile_same_dim_backward, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) + @autotest() + def test_flow_tile_with_random_data(test_case): + x = random_pytorch_tensor(ndim=2, dim0=1, dim1=2) + reps = (random(1, 5).to(int), random(1, 5).to(int), random(1, 5).to(int)) + z = torch.tile(x, reps) + return z + + @autotest() + def test_flow_tensor_tile_with_random_data(test_case): + x = random_pytorch_tensor(ndim=2, dim0=1, dim1=2) + reps = (random(1, 5).to(int), random(1, 5).to(int), random(1, 5).to(int)) + y = x.tile(reps) + return y if __name__ == "__main__": diff --git a/python/oneflow/test_utils/automated_test_util/generators.py b/python/oneflow/test_utils/automated_test_util/generators.py index 7970529aa6f..b6889b39088 100644 --- a/python/oneflow/test_utils/automated_test_util/generators.py +++ b/python/oneflow/test_utils/automated_test_util/generators.py @@ -26,6 +26,8 @@ import oneflow as flow py_tuple = tuple +NoneType = type(None) + 
TEST_MODULE = 0 TEST_FLOW = 1 TEST_TENSOR = 2 diff --git a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py index b8ed340816c..e60b842ee1c 100644 --- a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py +++ b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py @@ -306,9 +306,9 @@ def check_equality(dual_object: DualObject, rtol=0.0001, atol=1e-05): break assert checker is not None, ( "checker not found for type " - + type(dual_object.pytorch) + + str(type(dual_object.pytorch)) + " and " - + type(dual_object.oneflow) + + str(type(dual_object.oneflow)) ) return checker(dual_object.pytorch, dual_object.oneflow, rtol, atol) @@ -324,7 +324,7 @@ def check_tensor_equality(torch_tensor, flow_tensor, rtol=0.0001, atol=1e-05): flow_grad = flow_tensor.grad.numpy() if not np.allclose(torch_grad, flow_grad, rtol=rtol, atol=atol): print( - "Grads are not equal. PyTorch grad: \n{torch_grad}\n, OneFlow grad: \n{flow_grad}" + f"Grads are not equal. PyTorch grad: \n{torch_grad}\n, OneFlow grad: \n{flow_grad}" ) return False equality_res = np.allclose( diff --git a/tools/cfg/template/template.cfg.cpp b/tools/cfg/template/template.cfg.cpp index 8d5486989d2..79598a3b824 100644 --- a/tools/cfg/template/template.cfg.cpp +++ b/tools/cfg/template/template.cfg.cpp @@ -872,6 +872,10 @@ void {{ util.class_name(cls) }}::CopyFrom(const {{ util.class_name(cls) }}& othe CopyFrom(other); return *this; } +{{ util.class_name(cls) }}& {{ util.class_name(cls) }}::operator=({{ util.class_name(cls) }}&& other) { + data_ = std::move(other.data_); + return *this; +} {% for field in util.message_type_fields(cls) %} {% if util.field_has_required_or_optional_label(field) %} diff --git a/tools/cfg/template/template.cfg.h b/tools/cfg/template/template.cfg.h index 8c43dadc48f..c9b7590ea1b 100644 --- a/tools/cfg/template/template.cfg.h +++ b/tools/cfg/template/template.cfg.h @@ -342,6 +342,7 @@ class {{ util.class_name(cls) }} final : public Const{{ util.class_name(cls) }} void Clear(); void CopyFrom(const {{ util.class_name(cls) }}& other); {{ util.class_name(cls) }}& operator=(const {{ util.class_name(cls) }}& other); + {{ util.class_name(cls) }}& operator=({{ util.class_name(cls) }}&& other); {% for field in util.message_type_fields(cls) %} {% if util.field_has_required_or_optional_label(field) %}
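
Editor's note on the `template.cfg.{h,cpp}` hunks above: the code generator's template now also emits a move-assignment operator, so `x = std::move(y)` on a generated cfg object transfers the underlying `data_` handle instead of silently falling back to the deep-copying copy assignment. Below is a minimal sketch of what the rendered code could look like for a hypothetical generated class `ExampleConf`. The class name, the `Data` struct, and the constructor are illustrative assumptions (the real name comes from `util.class_name(cls)` and the payload from the generated fields); only the bodies of the two assignment operators mirror the template verbatim.

#include <memory>
#include <utility>

class ExampleConf final {
 public:
  ExampleConf() : data_(std::make_shared<Data>()) {}

  void CopyFrom(const ExampleConf& other) {
    // Deep copy, as in the generated CopyFrom(): allocate a fresh payload
    // so the two objects do not share state afterwards.
    data_ = std::make_shared<Data>(*other.data_);
  }

  // Pre-existing generated copy assignment: deep copy via CopyFrom().
  ExampleConf& operator=(const ExampleConf& other) {
    CopyFrom(other);
    return *this;
  }

  // Newly generated move assignment: steals the payload handle instead of
  // cloning it, matching the template body `data_ = std::move(other.data_);`.
  ExampleConf& operator=(ExampleConf&& other) {
    data_ = std::move(other.data_);
    return *this;
  }

 private:
  struct Data { int field_ = 0; };  // stand-in for the generated message fields
  std::shared_ptr<Data> data_;
};

With the move overload in place, assigning from a temporary (e.g. `conf = MakeExampleConf();`, where `MakeExampleConf` is a hypothetical factory) reuses the temporary's payload rather than allocating and copying a new one; the copy-assignment path is unchanged.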