
Support inplace operations #5204

Merged · 36 commits · Jul 7, 2021
Changes from 15 commits

Commits (36)
8795a7b
support inplace forward
poohRui Jun 15, 2021
455e34c
support inplace backward
poohRui Jun 15, 2021
1d6d80c
add test case
poohRui Jun 15, 2021
15390c6
add test case for clone
poohRui Jun 15, 2021
d01d59e
inplace is not support for leaf nodes
poohRui Jun 16, 2021
6dcf506
refine clone
poohRui Jun 16, 2021
7844574
add checks
poohRui Jun 17, 2021
6abb45c
refine
poohRui Jun 17, 2021
530aa41
forbid clone with no grad
poohRui Jun 17, 2021
76605d9
Separate autograd meta to tensor (#5267)
poohRui Jun 23, 2021
4d74ea1
conflict
poohRui Jun 23, 2021
f55d807
inplace without clone
poohRui Jun 24, 2021
f583175
refine
poohRui Jun 24, 2021
77ea879
minor fix
poohRui Jun 24, 2021
dede4f6
remove maybe from constructor
poohRui Jun 25, 2021
25de718
change from create to set
poohRui Jun 28, 2021
45035a7
Merge remote-tracking branch 'origin/master' into dev_support_inplace
wyg1997 Jul 1, 2021
4405fd4
fix merge bugs
wyg1997 Jul 1, 2021
1622323
fix merge bug
wyg1997 Jul 1, 2021
da8e198
remove inplace flag in local_call_opkernel_phy_instr_operand
wyg1997 Jul 1, 2021
48a72e3
Merge remote-tracking branch 'origin/master' into dev_support_inplace
wyg1997 Jul 5, 2021
c09e936
remove out-date codes
wyg1997 Jul 5, 2021
06f9cc3
Merge branch 'master' into dev_support_inplace
wyg1997 Jul 6, 2021
ba40b2f
refine code
wyg1997 Jul 7, 2021
799bbfb
Merge branch 'master' into dev_support_inplace
hjchen2 Jul 7, 2021
e378fa9
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
6c56553
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
819c847
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
c140c13
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
a410b76
add JUST
wyg1997 Jul 7, 2021
3268162
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
5dd344c
fix merge master bug
wyg1997 Jul 7, 2021
794c23a
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
21364ca
revert autograd engine input_grad check
wyg1997 Jul 7, 2021
8462703
Merge branch 'master' into dev_support_inplace
oneflow-ci-bot Jul 7, 2021
adc8211
fix bug in tensor_hook
wyg1997 Jul 7, 2021
3 changes: 2 additions & 1 deletion oneflow/api/python/framework/tensor.cpp
@@ -247,9 +247,10 @@ void ExportTensor(py::module& m, const char* name) {
// Methods of pytorch
.def("retain_grad",
[](T& t) {
if (!t.is_leaf()) { t.set_retain_grad(true); }
if (!t.is_leaf()) { t.set_retain_grad(true).GetOrThrow(); }
})
.def("detach", [](const T& t) { return t.api_detach().GetPtrOrThrow(); })
.def("clone", [](const T& t) { return t.api_clone().GetPtrOrThrow(); })
// OneFlow tensor properties other than pytorch tensor
.def_property_readonly("is_lazy", &T::is_lazy)
.def_property_readonly("is_consistent", &T::is_consistent);
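The binding change above replaces `t.set_retain_grad(true)` with `t.set_retain_grad(true).GetOrThrow()`: the setter now returns a `Maybe`, and the Python boundary has to surface a stored error as an exception instead of dropping it. Below is a minimal sketch of that pattern using made-up types (`MaybeVoid`, `RetainGradBinding`); it is not OneFlow's actual `Maybe` implementation.

```cpp
#include <optional>
#include <stdexcept>
#include <string>
#include <utility>

// Stand-in for a Maybe<void>-style result: either "ok" or an error message.
class MaybeVoid {
 public:
  static MaybeVoid Ok() { return MaybeVoid(std::nullopt); }
  static MaybeVoid Error(std::string msg) { return MaybeVoid(std::move(msg)); }

  // At the Python boundary a stored error becomes a C++ exception,
  // which the binding layer can translate into a Python exception.
  void GetOrThrow() const {
    if (error_) { throw std::runtime_error(*error_); }
  }

 private:
  explicit MaybeVoid(std::optional<std::string> error) : error_(std::move(error)) {}
  std::optional<std::string> error_;
};

// Shape of the binding after the change: the Maybe-returning setter is
// immediately followed by GetOrThrow() so failures are not silently dropped.
void RetainGradBinding(bool is_leaf) {
  if (!is_leaf) {
    // Stands in for t.set_retain_grad(true); any recorded error is thrown here.
    MaybeVoid::Ok().GetOrThrow();
  }
}
```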
14 changes: 8 additions & 6 deletions oneflow/core/autograd/autograd_engine.cpp
@@ -68,15 +68,16 @@ StackFunctionNode::StackFunctionNode(
input_meta_datas_.resize(inputs.size());
next_functions_->reserve(inputs.size());
for (int i = 0; i < inputs.size(); ++i) {
input_meta_datas_.at(i) = inputs.at(i)->mut_autograd_meta();
if (input_meta_datas_.at(i)->requires_grad()) {
if (inputs.at(i)->requires_grad()) {
input_meta_datas_.at(i) = inputs.at(i)->mut_autograd_meta();
next_functions_->emplace_back(inputs.at(i)->grad_fn_node());
}
}

output_meta_datas_.resize(outputs.size());
output_tensor_infos_.reserve(outputs.size());
for (int i = 0; i < outputs.size(); ++i) {
outputs.at(i)->create_autograd_meta();
output_meta_datas_.at(i) = outputs.at(i)->mut_autograd_meta();
output_tensor_infos_.emplace_back(TensorInfo(*outputs.at(i)));
}
@@ -129,6 +130,7 @@ Maybe<bool> StackFunctionNode::Apply(bool create_graph) {
JUST((*backward_fn_)(output_grads, &input_grads, create_graph));
for (int i = 0; i < input_meta_datas_.size(); ++i) {
if (input_grads.at(i)) {
CHECK_NOTNULL_OR_RETURN(input_meta_datas_.at(i));
JUST(input_meta_datas_.at(i)->now_grad_arg()->PushPartialTensor(input_grads.at(i)));
}
}
@@ -151,7 +153,7 @@ Maybe<void> StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const Tensor
bool create_graph) {
ClearReleasedFunctionNodes();
for (int i = 0; i < outputs.size(); ++i) {
JUST(outputs.at(i)->now_grad_arg()->PushPartialTensor(out_grads.at(i)));
JUST(JUST(outputs.at(i)->now_grad_arg())->PushPartialTensor(out_grads.at(i)));
}
// Runs each FunctionNode
for (const auto& weak_func_node : node_list_) {
@@ -179,7 +181,7 @@ Maybe<TensorTuple> StackAutogradEngine::RunBackwardAndReturnInputsTensorGrad(
inputs.at(i)->set_retain_grad(true);
}
for (int i = 0; i < outputs.size(); ++i) {
JUST(outputs.at(i)->now_grad_arg()->PushPartialTensor(out_grads.at(i)));
JUST(JUST(outputs.at(i)->now_grad_arg())->PushPartialTensor(out_grads.at(i)));
}
// Runs each FunctionNode
for (const auto& weak_func_node : node_list_) {
@@ -192,9 +194,9 @@
}
}
for (int i = 0; i < inputs.size(); ++i) {
input_now_grads->at(i) = inputs.at(i)->acc_grad();
input_now_grads->at(i) = JUST(inputs.at(i)->acc_grad());
if (!ori_retain_grad.at(i)) {
inputs.at(i)->mut_acc_grad().reset();
JUST(inputs.at(i)->mut_acc_grad()).reset();
inputs.at(i)->set_retain_grad(false);
}
}
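The `autograd_engine.cpp` hunks change two related things: input autograd metadata is now recorded only for inputs that actually require grad (other slots stay null), and `Apply` gains a `CHECK_NOTNULL_OR_RETURN` before a partial gradient is pushed into such a slot. A minimal sketch of the first part follows, using simplified stand-in types rather than OneFlow's real classes.

```cpp
#include <memory>
#include <vector>

// Stand-ins for the real tensor / autograd-meta classes.
struct AutogradMeta {
  bool requires_grad = false;
};

struct TensorSketch {
  std::shared_ptr<AutogradMeta> meta = std::make_shared<AutogradMeta>();
  bool requires_grad() const { return meta->requires_grad; }
};

// Mirrors the constructor change: metadata (and a backward edge) is captured
// only for inputs that require grad; the other slots stay nullptr, which is
// why the later null check is needed before pushing a partial gradient.
std::vector<std::shared_ptr<AutogradMeta>> CollectInputMeta(
    const std::vector<TensorSketch>& inputs) {
  std::vector<std::shared_ptr<AutogradMeta>> input_meta(inputs.size());
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].requires_grad()) { input_meta[i] = inputs[i].meta; }
  }
  return input_meta;
}
```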
11 changes: 9 additions & 2 deletions oneflow/core/eager/local_call_opkernel_phy_instr_operand.h
@@ -48,13 +48,19 @@ class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand {

LocalCallOpKernelPhyInstrOperand(const std::shared_ptr<one::StatefulLocalOpKernel>& opkernel,
const one::EagerBlobObjectListPtr& inputs,
const one::EagerBlobObjectListPtr& outputs, const AttrMap& attrs)
: opkernel_(opkernel), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
const one::EagerBlobObjectListPtr& outputs, const AttrMap& attrs,
bool is_inplace)
: opkernel_(opkernel),
inputs_(inputs),
outputs_(outputs),
attrs_(attrs),
is_inplace_(is_inplace) {}

const one::StatefulLocalOpKernel& opkernel() const { return *opkernel_; }
const one::EagerBlobObjectListPtr& inputs() const { return inputs_; }
const one::EagerBlobObjectListPtr& outputs() const { return outputs_; }
const AttrMap& attrs() const { return attrs_; }
bool is_inplace() const { return is_inplace_; }

one::StatefulLocalOpKernel* mut_opkernel() { return opkernel_.get(); }

@@ -86,6 +92,7 @@ class LocalCallOpKernelPhyInstrOperand final : public vm::PhyInstrOperand {
one::EagerBlobObjectListPtr outputs_;
const AttrMap attrs_;
const user_op::OpKernel* user_opkernel_;
bool is_inplace_;
};

} // namespace vm
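The operand now receives the inplace decision as a constructor argument and exposes it through `is_inplace()`, so the instruction-execution path can branch on it without recomputing anything. The sketch below only illustrates that shape with stand-in types; it is not the real `LocalCallOpKernelPhyInstrOperand`.

```cpp
#include <memory>
#include <utility>

struct EagerBlobListSketch {};  // stands in for one::EagerBlobObjectList

// Sketch of the operand shape: the interpreter decides inplace-ness once and
// the flag travels with the instruction operand.
class CallOperandSketch {
 public:
  CallOperandSketch(std::shared_ptr<EagerBlobListSketch> inputs,
                    std::shared_ptr<EagerBlobListSketch> outputs, bool is_inplace)
      : inputs_(std::move(inputs)), outputs_(std::move(outputs)), is_inplace_(is_inplace) {}

  bool is_inplace() const { return is_inplace_; }

 private:
  std::shared_ptr<EagerBlobListSketch> inputs_;
  std::shared_ptr<EagerBlobListSketch> outputs_;
  bool is_inplace_;
};
```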
4 changes: 2 additions & 2 deletions oneflow/core/eager/opkernel_instruction_type.cpp
@@ -447,7 +447,7 @@ struct LocalCallOpKernelUtil final {
operand->set_user_opkernel(
JUST(operand->mut_opkernel()->ChooseOpKernel(operand->inputs(), operand->outputs())));
JUST(CheckOutputBlobObjectsMemCase(operand, instruction->stream()));
JUST(InitOutputBlobs(operand));
if (!operand->is_inplace()) { JUST(InitOutputBlobs(operand)); }
JUST(InferTempStorageBlobDesc(operand));
JUST(ResetTempStorageBlob(operand));
return Maybe<void>::Ok();
@@ -456,7 +456,7 @@
static inline Maybe<void> Compute(vm::Instruction* instruction) {
auto* operand = JUST(GetLocalCallOpKernelPhyInstrOperand(instruction));
DeviceCtx* device_ctx = instruction->stream().device_ctx().get();
JUST(AllocateOutputBlobsMemory(operand, device_ctx));
if (!operand->is_inplace()) { JUST(AllocateOutputBlobsMemory(operand, device_ctx)); }
JUST(TryAllocateTempStorageBlobMemory(operand, device_ctx));
user_op::OpKernelState* state;
TryInitOpKernelState(operand, device_ctx, &state);
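The two guarded calls above are the point of the flag: for an inplace call the output blob objects alias the input blob objects, so initializing output blobs or allocating output memory again would be redundant at best. A hedged illustration of the branch, with stand-in types:

```cpp
#include <cstdio>

struct OperandSketch {
  bool is_inplace = false;
};

// For a normal call the outputs are fresh tensors that need blob headers and
// device memory; for an inplace call they alias the input blobs, so both
// steps are skipped and the input storage is reused as-is.
void PrepareOutputs(const OperandSketch& operand) {
  if (!operand.is_inplace) {
    std::printf("InitOutputBlobs + AllocateOutputBlobsMemory\n");
  } else {
    std::printf("reuse input blob storage\n");
  }
}
```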
5 changes: 5 additions & 0 deletions oneflow/core/framework/device.cpp
@@ -119,6 +119,11 @@ std::string Device::ToString() const {
return ss.str();
}

std::ostream& operator<<(std::ostream& out, const Device& device) {
out << device.ToString();
return out;
}

Maybe<const Device> Device::MakeDeviceByParallelDesc(const ParallelDesc& parallel_desc) {
std::string type = parallel_desc.device_tag();
if (parallel_desc.device_tag() == "gpu") { type = "cuda"; }
2 changes: 2 additions & 0 deletions oneflow/core/framework/device.h
@@ -66,6 +66,8 @@ class Device final : public std::enable_shared_from_this<Device> {
std::shared_ptr<VmLocalDepObject> compute_local_dep_object_;
};

std::ostream& operator<<(std::ostream& out, const Device& device);

} // namespace oneflow

namespace std {
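The new `operator<<` simply delegates to `Device::ToString()`, which makes devices printable in checks and log messages. A small self-contained usage sketch (the `DeviceSketch` type is a stand-in, not OneFlow's `Device`):

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Stand-in with the same ToString()-based printing contract as the diff.
struct DeviceSketch {
  std::string type;
  int device_id;
  std::string ToString() const {
    std::ostringstream ss;
    ss << type << ":" << device_id;
    return ss.str();
  }
};

std::ostream& operator<<(std::ostream& out, const DeviceSketch& device) {
  out << device.ToString();
  return out;
}

int main() {
  DeviceSketch d{"cuda", 0};
  std::cout << "running on " << d << std::endl;  // prints: running on cuda:0
  return 0;
}
```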
4 changes: 2 additions & 2 deletions oneflow/core/framework/instructions_builder.cpp
@@ -650,11 +650,11 @@ Maybe<void> InstructionsBuilder::LocalCallOpKernel(
const one::EagerBlobObjectListPtr& input_eager_blob_objects,
const one::EagerBlobObjectListPtr& output_eager_blob_objects, const AttrMap& attrs,
const std::shared_ptr<const ParallelDesc>& parallel_desc_sym,
const std::string& instr_type_name) {
const std::string& instr_type_name, bool is_inplace) {
ObjectMsgPtr<vm::InstructionMsg> instruction =
ObjectMsgPtr<vm::InstructionMsg>::New(instr_type_name);
auto phy_instr_operand = std::make_shared<vm::LocalCallOpKernelPhyInstrOperand>(
opkernel, input_eager_blob_objects, output_eager_blob_objects, attrs);
opkernel, input_eager_blob_objects, output_eager_blob_objects, attrs, is_inplace);
*instruction->mut_parallel_desc() = parallel_desc_sym;
*instruction->mutable_phy_instr_operand() = phy_instr_operand;
instruction_list_->EmplaceBack(std::move(instruction));
2 changes: 1 addition & 1 deletion oneflow/core/framework/instructions_builder.h
@@ -249,7 +249,7 @@ class InstructionsBuilder : public std::enable_shared_from_this<InstructionsBuil
const one::EagerBlobObjectListPtr& output_eager_blob_objects,
const AttrMap& attrs,
const std::shared_ptr<const ParallelDesc>& parallel_desc_sym,
const std::string& instr_type_name);
const std::string& instr_type_name, bool is_inplace);

private:
Maybe<void> RankFrontSeqCallback(const std::string& instruction_name,
@@ -55,59 +55,79 @@ Maybe<void> NaiveInterpret(const UserOpExpr& user_op_expr, const TensorTuple& in
TensorTuple* outputs, const AttrMap& attrs) {
std::shared_ptr<EagerBlobObjectList> input_eager_blob_objects =
std::make_shared<EagerBlobObjectList>(inputs.size());
for (int i = 0; i < inputs.size(); i++) {
for (int i = 0; i < inputs.size(); ++i) {
const auto& input_device = JUST(inputs.at(i)->device());
if (i > 0) {
CHECK_OR_RETURN(*default_device == *input_device) << Error::InputDeviceNotMatchError();
}
input_eager_blob_objects->at(i) = JUST(inputs.at(i)->eager_blob_object());
}
for (int i = 0; i < outputs->size(); i++) {
std::shared_ptr<EagerBlobObjectList> output_eager_blob_objects =
std::make_shared<EagerBlobObjectList>(outputs->size());
for (int i = 0; i < outputs->size(); ++i) {
if (!outputs->at(i)) {
outputs->at(i) =
std::make_shared<MirroredTensor>(std::make_shared<EagerMirroredTensorImpl>());
}
if (JUST(outputs->at(i)->has_eager_blob_object())) {
output_eager_blob_objects->at(i) = JUST(outputs->at(i)->eager_blob_object());
}
}
std::shared_ptr<EagerBlobObjectList> output_eager_blob_objects =
std::make_shared<EagerBlobObjectList>(outputs->size());
std::shared_ptr<const Device> op_device;
std::shared_ptr<const ParallelDesc> op_parallel_desc;
bool need_check_mem_case = true;
bool need_event_record = false;

// Infer devices
if (!user_op_expr.has_device_infer_fn()) {
bool is_inplace =
std::all_of(output_eager_blob_objects->begin(), output_eager_blob_objects->end(),
[](const std::shared_ptr<vm::EagerBlobObject>& eager_blob_object) {
return eager_blob_object != nullptr;
});
if (is_inplace) {
for (int i = 0; i < outputs->size(); ++i) {
CHECK_EQ_OR_RETURN(*JUST(outputs->at(i)->device()), *JUST(inputs.at(i)->device()));
output_eager_blob_objects->at(i) = JUST(outputs->at(i)->eager_blob_object());
CHECK_EQ_OR_RETURN(output_eager_blob_objects->at(i), input_eager_blob_objects->at(i));
CHECK_EQ_OR_RETURN(output_eager_blob_objects->at(i)->blob_desc().shape(),
input_eager_blob_objects->at(i)->blob_desc().shape());
}
op_device = default_device;
op_parallel_desc = op_device->parallel_desc_ptr();
for (int i = 0; i < outputs->size(); i++) {
auto* tensor_impl = JUST(TensorImpl4Tensor(outputs->at(i)));
*tensor_impl->mut_device() = default_device;
}
} else {
need_check_mem_case = false;
op_device = JUST(user_op_expr.InferDevices(attrs, inputs, outputs));
for (const auto& input_tensor : inputs) {
const auto& input_device = JUST(input_tensor->device());
need_event_record = need_event_record || !(*op_device == *input_device);
// Infer devices
if (!user_op_expr.has_device_infer_fn()) {
op_device = default_device;
op_parallel_desc = op_device->parallel_desc_ptr();
for (int i = 0; i < outputs->size(); ++i) {
auto* tensor_impl = JUST(TensorImpl4Tensor(outputs->at(i)));
*tensor_impl->mut_device() = default_device;
}
} else {
need_check_mem_case = false;
op_device = JUST(user_op_expr.InferDevices(attrs, inputs, outputs));
for (const auto& input_tensor : inputs) {
const auto& input_device = JUST(input_tensor->device());
need_event_record = need_event_record || !(*op_device == *input_device);
}
op_device = default_device;
op_parallel_desc = op_device->parallel_desc_ptr();
}
op_parallel_desc = op_device->parallel_desc_ptr();
}

// Infer shapes and dtypes
const auto& device_tag = JUST(op_device->of_type());
JUST(user_op_expr.InferLogicalShapeAndDType(
attrs, device_tag,
[&](int32_t i) -> const TensorMeta* {
return CHECK_JUST(TensorImpl4Tensor(inputs.at(i)))->tensor_meta().get();
},
[&](int32_t i) -> TensorMeta* {
return CHECK_JUST(TensorImpl4Tensor(outputs->at(i)))->mut_tensor_meta();
}));

for (int i = 0; i < output_eager_blob_objects->size(); i++) {
auto* tensor_impl = JUST(TensorImpl4Tensor(outputs->at(i)));
JUST(tensor_impl->InitEagerBlobObject(JUST(outputs->at(i)->device())->mem_case()));
output_eager_blob_objects->at(i) = JUST(tensor_impl->eager_blob_object());
// Infer shapes and dtypes
const auto& device_tag = JUST(op_device->of_type());
JUST(user_op_expr.InferLogicalShapeAndDType(
attrs, device_tag,
[&](int32_t i) -> const TensorMeta* {
return CHECK_JUST(TensorImpl4Tensor(inputs.at(i)))->tensor_meta().get();
},
[&](int32_t i) -> TensorMeta* {
return CHECK_JUST(TensorImpl4Tensor(outputs->at(i)))->mut_tensor_meta();
}));

for (int i = 0; i < output_eager_blob_objects->size(); ++i) {
auto* tensor_impl = JUST(TensorImpl4Tensor(outputs->at(i)));
JUST(tensor_impl->InitEagerBlobObject(JUST(outputs->at(i)->device())->mem_case()));
output_eager_blob_objects->at(i) = JUST(tensor_impl->eager_blob_object());
}
}

const auto kernel = JUST(user_op_expr.MutKernel4Device(*op_device));
@@ -130,7 +150,7 @@ Maybe<void> NaiveInterpret(const UserOpExpr& user_op_expr, const TensorTuple& in
}
}
return builder->LocalCallOpKernel(kernel, input_eager_blob_objects, output_eager_blob_objects,
attrs, op_parallel_desc, instr_type_name);
attrs, op_parallel_desc, instr_type_name, is_inplace);
}));
return Maybe<void>::Ok();
}
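The interpreter hunk above carries the core decision: a call is treated as inplace when every output tensor already has an eager blob object bound, and in that case each output must alias the corresponding input blob and agree with it on device and shape. A simplified sketch of that logic, with stand-in types and `assert` in place of `CHECK_*_OR_RETURN`:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

struct BlobSketch {
  std::vector<int64_t> shape;
  int device_id = 0;
};
using BlobPtr = std::shared_ptr<BlobSketch>;

// Assumes one output per input, matching the pairwise checks in the diff.
bool DecideInplace(const std::vector<BlobPtr>& input_blobs,
                   const std::vector<BlobPtr>& output_blobs) {
  // Inplace iff every output slot was already bound to an eager blob object.
  const bool is_inplace =
      std::all_of(output_blobs.begin(), output_blobs.end(),
                  [](const BlobPtr& blob) { return blob != nullptr; });
  if (is_inplace) {
    assert(output_blobs.size() == input_blobs.size());
    for (size_t i = 0; i < output_blobs.size(); ++i) {
      // The output must literally alias the input blob and agree on device
      // and shape; otherwise executing the kernel inplace would be unsound.
      assert(output_blobs[i] == input_blobs[i]);
      assert(output_blobs[i]->device_id == input_blobs[i]->device_id);
      assert(output_blobs[i]->shape == input_blobs[i]->shape);
    }
  }
  return is_inplace;
}
```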
18 changes: 18 additions & 0 deletions oneflow/core/framework/tensor.cpp
@@ -20,6 +20,9 @@ limitations under the License.
#include "oneflow/core/framework/tensor_tuple.h"
#include "oneflow/core/autograd/autograd_engine.h"
#include "oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_expr.h"

namespace oneflow {

@@ -62,6 +65,21 @@ Maybe<MirroredTensor> MirroredTensor::api_detach() const {
return std::make_shared<MirroredTensor>(JUST(impl_->detach()));
}

Maybe<Tensor> MirroredTensor::clone() const {
const auto& device_type = JUST(this->device())->type();
int64_t device_id = JUST(this->device())->device_id();
std::shared_ptr<OpExpr> copy_op_ = JUST(one::OpBuilder("copy")
.Input("in", 1)
.Attr("device_type", device_type)
.Attr("device_id", device_id)
.Output("out", 1)
.Build());
std::shared_ptr<MirroredTensor> input =
std::const_pointer_cast<MirroredTensor>(shared_from_this());
const auto& output = JUST(OpInterpUtil::Dispatch<Tensor>(*copy_op_, {input}));
return output;
}

Maybe<ConsistentTensor> ConsistentTensor::MakeTensor(
const std::shared_ptr<const Shape>& shape, DataType dtype,
Symbol<cfg::ParallelDistribution> parallel_distribution, Symbol<ParallelDesc> parallel_desc,
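`MirroredTensor::clone()` above is implemented by building a `copy` user op targeting the tensor's own device and dispatching it through the normal interpreter, so the clone gets a proper grad function like any other op output. The sketch below only mimics that builder call shape with made-up classes; it is not OneFlow's real `OpBuilder`/`OpExpr` API.

```cpp
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>

// Made-up stand-ins for OpExpr / OpBuilder, only to show the call shape.
struct OpExprSketch {
  std::string op_type;
  std::map<std::string, std::string> attrs;
};

class OpBuilderSketch {
 public:
  explicit OpBuilderSketch(std::string op_type) { expr_.op_type = std::move(op_type); }
  OpBuilderSketch& Attr(const std::string& key, const std::string& value) {
    expr_.attrs[key] = value;
    return *this;
  }
  std::shared_ptr<OpExprSketch> Build() const { return std::make_shared<OpExprSketch>(expr_); }

 private:
  OpExprSketch expr_;
};

// clone()-style construction: a "copy" op targeting the source tensor's own
// device, which the interpreter then dispatches like any other user op.
std::shared_ptr<OpExprSketch> MakeCopyOp(const std::string& device_type, int64_t device_id) {
  return OpBuilderSketch("copy")
      .Attr("device_type", device_type)
      .Attr("device_id", std::to_string(device_id))
      .Build();
}
```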