
Commit

Merge branch 'develop' into yj/rm_multinode_eager_guard_test
yjjiang11 committed Dec 8, 2022
2 parents 3e395ae + 8c41665 commit 278cf18
Showing 722 changed files with 7,041 additions and 13,953 deletions.
6 changes: 0 additions & 6 deletions .flake8
@@ -37,9 +37,3 @@ per-file-ignores =
.cmake-format.py: F821
python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py: F821
python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py: F821
# These files will be fixed in the future
python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py: F811
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py: F811
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py: F811
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py: F811
python/paddle/fluid/tests/unittests/test_rrelu_op.py: F811
21 changes: 20 additions & 1 deletion paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -14,17 +14,36 @@ limitations under the License. */

#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"

#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/operator.h"

namespace paddle {
namespace framework {
namespace ir {

void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
static constexpr char kNotAllowInferShapeCahce[] =
"@NOT_ALLOW_INFERSHAPE_CACHE@";
VLOG(3) << "Applies Runtime Context Cache strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()) {
n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
n->Op()->SetAttr(framework::kEnableCacheRuntimeContext, true);
}
}

// If both op1 -> var0 and op2 -> var0, then neither op1 nor op2 supports
// InferShapeCache.
std::unordered_map<std::string, std::vector<Node*>> var2ops;
for (auto* op_node : TopologySortOperations(*graph)) {
for (auto* var_node : op_node->outputs) {
var2ops[var_node->Name()].push_back(op_node);
}
}
for (auto& it : var2ops) {
if (it.second.size() > 1) {
for (auto op_node : it.second) {
op_node->Op()->SetAttr(kNotAllowInferShapeCahce, true);
}
}
}
}
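For reference, a minimal standalone sketch (editorial, not part of this diff) of the var2ops bookkeeping above, using plain strings in place of graph nodes; the toy op and variable names are made up:

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

int main() {
  // Toy graph: op name -> names of the variables it produces.
  std::vector<std::pair<std::string, std::vector<std::string>>> ops = {
      {"op1", {"var0"}}, {"op2", {"var0"}}, {"op3", {"var1"}}};

  // variable name -> ops that write it (the var2ops map above).
  std::unordered_map<std::string, std::vector<std::string>> var2ops;
  for (const auto& op : ops)
    for (const auto& var : op.second) var2ops[var].push_back(op.first);

  // Every variable with more than one producer disables InferShape caching
  // for all of its producers.
  std::unordered_set<std::string> no_infershape_cache;
  for (const auto& it : var2ops)
    if (it.second.size() > 1)
      for (const auto& op : it.second) no_infershape_cache.insert(op);

  for (const auto& op : no_infershape_cache)
    std::cout << op << " cannot cache InferShape results\n";
  return 0;
}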
@@ -356,7 +356,8 @@ void StreamAnalyzer::ShrinkEventInfo(

platform::DeviceType StreamAnalyzer::GetWaiterType(
const Instruction& instr) const {
if (instr.KernelType() == OpFuncType::kCpuSync) {
if (instr.KernelType() == OpFuncType::kCpuSync ||
instr.KernelType() == OpFuncType::kGpuSync) {
return platform::kCPU;
} else {
if (platform::is_xpu_place(place_)) {
80 changes: 74 additions & 6 deletions paddle/fluid/framework/operator.cc
@@ -15,6 +15,7 @@ limitations under the License. */

#include <sstream>
#include <string>
#include <unordered_set>

#include "gflags/gflags.h"
#include "paddle/fluid/framework/convert_utils.h"
@@ -36,6 +37,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_context.h"
#include "paddle/phi/core/kernel_factory.h"
#include "paddle/phi/ops/compat/signatures.h"
@@ -562,6 +564,14 @@ phi::DenseTensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
}
}

OperatorWithKernel::OperatorWithKernel(const std::string& type,
const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}

OperatorWithKernel::~OperatorWithKernel() = default;

bool ExecutionContext::HasInput(const std::string& name) const {
auto* var = InputVar(name);
return var != nullptr;
@@ -1204,19 +1214,54 @@ class RuntimeInferShapeContext : public InferShapeContext {
};

struct OperatorWithKernel::CacheImpl {
static const char kNotAllowInferShapeCahce[];
explicit CacheImpl(phi::KernelContext* kernel_ctx,
RuntimeInferShapeContext* infer_shape_ctx)
: kernel_ctx_(kernel_ctx), infer_shape_ctx_(infer_shape_ctx) {}
RuntimeInferShapeContext* infer_shape_ctx,
const std::vector<phi::DenseTensor*>& tensors,
bool not_allow_infer_shape_cache)
: kernel_ctx_(kernel_ctx),
infer_shape_ctx_(infer_shape_ctx),
tensors_(tensors),
not_allow_infer_shape_cache_(not_allow_infer_shape_cache) {}

phi::KernelContext* getKernelContext() { return kernel_ctx_.get(); }
RuntimeInferShapeContext* getRuntimeInferShapeContext() {
return infer_shape_ctx_.get();
}

bool NeedInferShape() {
if (not_allow_infer_shape_cache_) return true;

bool ret{false};
if (last_ddims_.empty() || tensors_.empty()) ret = true;
if (!ret) {
CHECK_EQ(last_ddims_.size(), tensors_.size());
for (size_t i = 0; i < last_ddims_.size(); ++i) {
if (tensors_[i]->dims() != last_ddims_[i]) {
ret = true;
break;
}
}
}
if (ret) {
last_ddims_.resize(tensors_.size());
for (size_t i = 0; i < last_ddims_.size(); ++i) {
last_ddims_[i] = tensors_[i]->dims();
}
}
VLOG(3) << "need infer shape is " << ret;
return ret;
}

private:
std::unique_ptr<phi::KernelContext> kernel_ctx_;
std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
std::vector<phi::DenseTensor*> tensors_;
bool not_allow_infer_shape_cache_;
std::vector<phi::DDim> last_ddims_;
};
const char OperatorWithKernel::CacheImpl::kNotAllowInferShapeCahce[] =
"@NOT_ALLOW_INFERSHAPE_CACHE@";

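As a rough illustration (editorial, not from the diff), the shape-comparison idea behind NeedInferShape can be sketched with plain vectors standing in for phi::DDim; all names here are made up:

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using ToyDDim = std::vector<int64_t>;  // stand-in for phi::DDim

struct ToyShapeCache {
  std::vector<ToyDDim> last_ddims_;

  // Re-run InferShape only when an input's dims differ from the cached dims.
  bool NeedInferShape(const std::vector<ToyDDim>& current) {
    bool ret = last_ddims_.empty() || current.empty();
    if (!ret) {
      assert(last_ddims_.size() == current.size());
      for (size_t i = 0; i < current.size(); ++i) {
        if (current[i] != last_ddims_[i]) {
          ret = true;
          break;
        }
      }
    }
    if (ret) last_ddims_ = current;  // refresh the cached dims
    return ret;
  }
};

int main() {
  ToyShapeCache cache;
  std::cout << cache.NeedInferShape({{2, 3}}) << "\n";  // 1: first call
  std::cout << cache.NeedInferShape({{2, 3}}) << "\n";  // 0: dims unchanged
  std::cout << cache.NeedInferShape({{4, 3}}) << "\n";  // 1: dims changed
  return 0;
}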
static void CheckTensorNANOrInf(const std::string& op_type,
const std::string& name,
@@ -1524,8 +1569,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
pre_scope_ = cur_scope;
} else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
!need_prepare_phi_data_) {
if (!all_kernels_must_compute_runtime_shape_)
if (!all_kernels_must_compute_runtime_shape_ && impl_->NeedInferShape()) {
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
}
(*phi_kernel_)(impl_->getKernelContext());
} else {
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
@@ -1828,9 +1874,31 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
phi::KernelContext phi_kernel_context;
if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
!need_prepare_data_) {
impl_ =
// TODO(inference): Now we only support dense_tensor cache; we may
// support ScalarTensor and SparseTensor in the future.
bool all_dense_tensor_input_{true};
for (auto& iter : Inputs()) {
for (auto& name : iter.second) {
all_dense_tensor_input_ &=
scope.FindVar(name)->IsType<phi::DenseTensor>();
}
}

std::vector<phi::DenseTensor*> tensors;
if (all_dense_tensor_input_) {
for (auto& iter : Inputs()) {
for (auto& name : iter.second) {
auto* t = scope.FindVar(name)->GetMutable<phi::DenseTensor>();
tensors.push_back(t);
}
}
}

impl_.reset(
new CacheImpl(new phi::KernelContext(),
new RuntimeInferShapeContext(*this, *runtime_ctx));
new RuntimeInferShapeContext(*this, *runtime_ctx),
tensors,
HasAttr(CacheImpl::kNotAllowInferShapeCahce)));
BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
(*phi_kernel_)(impl_->getKernelContext());
} else {
@@ -3246,6 +3314,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
if (phi::OneDNNContext::classof(dev_ctx)) {
phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
one_dnn_ctx->ClearDnnAttr();
if (!RuntimeAttrs().empty()) need_prepare_phi_data_ = true;
}
#endif

@@ -3267,7 +3336,6 @@
#if defined(PADDLE_WITH_MKLDNN) || defined(PADDLE_WITH_CUDA)
auto& runtime_attrs = RuntimeAttrs();
for (const auto& attr_iter : runtime_attrs) {
need_prepare_phi_data_ = true;
auto& attr_name = attr_iter.first;
auto& attr = attr_iter.second;
auto attr_propertys = paddle::operators::GetExtraAttrProperties(attr_name);
8 changes: 5 additions & 3 deletions paddle/fluid/framework/operator.h
@@ -612,8 +612,9 @@ class OperatorWithKernel : public OperatorBase {
OperatorWithKernel(const std::string& type,
const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
const AttributeMap& attrs);

virtual ~OperatorWithKernel();

static paddle::flat_hash_map<std::string /* op_type */, OpKernelMap>&
AllOpKernels() {
@@ -785,8 +786,9 @@ class OperatorWithKernel : public OperatorBase {
mutable std::unique_ptr<phi::Kernel> phi_kernel_;
mutable std::unique_ptr<phi::ArgumentMappingFn> arg_map_fn_;

private:
struct CacheImpl;
mutable CacheImpl* impl_{nullptr};
mutable std::unique_ptr<CacheImpl> impl_;
};

extern bool OpSupportGPU(const std::string& op_type);
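A small sketch (editorial, not part of the commit) of the C++ pattern this header change depends on: holding CacheImpl through std::unique_ptr with only a forward declaration requires the constructor and destructor to be defined out of line, after the complete type is visible, which is why they moved into operator.cc. Widget here is a made-up class name:

#include <memory>

// Header-side view: the implementation type is only forward-declared.
class Widget {
 public:
  Widget();
  ~Widget();  // declared here, defined where CacheImpl is complete
 private:
  struct CacheImpl;
  std::unique_ptr<CacheImpl> impl_;
};

// Source-side view: the complete type and the special members.
struct Widget::CacheImpl {
  int cached_value = 0;
};

Widget::Widget() : impl_(new CacheImpl()) {}
Widget::~Widget() = default;  // unique_ptr deletes a complete type here

int main() {
  Widget w;  // compiles and destroys cleanly
  return 0;
}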
28 changes: 24 additions & 4 deletions paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
@@ -484,7 +484,8 @@ void AnalyseClusterVariables(
const std::unordered_set<std::string>& deny_var_set,
GraphNodeSet* cluster_inputs,
GraphNodeSet* cluster_outputs,
GraphNodeSet* cluster_internals) {
GraphNodeSet* cluster_internals,
bool is_inference_stage) {
// collecting all input and output of op
for (auto* op_node : cluster) {
const auto& op_name = op_node->Name();
@@ -523,6 +524,18 @@
for (auto* var_node : *cluster_internals) {
cluster_outputs->erase(var_node);
}

if (is_inference_stage) {
// If part of the op's output is not used by any other operator, change it
// to internal, e.g. the transpose2 op's XShape output.
auto outs = *cluster_outputs;
for (auto* node : outs) {
if (node->outputs.empty()) {
cluster_outputs->erase(node);
cluster_internals->insert(node);
}
}
}
}
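As a loose illustration (editorial, not from the diff) of the inference-stage demotion above, with made-up variable names and a toy consumer map standing in for graph edges:

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

int main() {
  std::unordered_set<std::string> cluster_outputs = {"Out", "XShape"};
  std::unordered_set<std::string> cluster_internals;
  // variable -> ops outside the cluster that read it (hypothetical).
  std::unordered_map<std::string, std::vector<std::string>> consumers = {
      {"Out", {"relu"}}, {"XShape", {}}};

  // Iterate over a copy so erasing from cluster_outputs stays safe.
  auto outs = cluster_outputs;
  for (const auto& var : outs) {
    if (consumers[var].empty()) {
      cluster_outputs.erase(var);
      cluster_internals.insert(var);
    }
  }

  for (const auto& var : cluster_internals)
    std::cout << var << " became a cluster internal\n";
  return 0;
}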

void AddLinkToCinnOp(const GraphNodeSet& cluster_inputs,
@@ -611,7 +624,7 @@ void ReplaceSubGraphWithCinnOpNode(
// Here we use SubgraphDetector to detect subgraphs in which every op node
// is supported by CINN, and use OpMapperRegistry to check whether an op
// node is supported by CINN.
void SearchAllSubgraphs(Graph* graph) {
void SearchAllSubgraphs(Graph* graph, bool is_inference_stage) {
auto allow_ops = StringSplit(FLAGS_allow_cinn_ops, kDelim);
auto deny_ops = StringSplit(FLAGS_deny_cinn_ops, kDelim);
OpTransInfo trans_info;
@@ -671,7 +684,8 @@
deny_var_set,
&cluster_inputs,
&cluster_outputs,
&cluster_internals);
&cluster_internals,
is_inference_stage);

VLOG(4) << "Cluster Ops: " << cluster_debug_info(cluster_set);
VLOG(4) << "Cluster input vars: " << cluster_debug_info(cluster_inputs);
@@ -698,7 +712,13 @@
}
} // namespace

void BuildCinnPass::ApplyImpl(Graph* graph) const { SearchAllSubgraphs(graph); }
void BuildCinnPass::ApplyImpl(Graph* graph) const {
bool is_inference_stage{false};
if (Has("is_inference_stage")) {
is_inference_stage = Get<bool>("is_inference_stage");
}
SearchAllSubgraphs(graph, is_inference_stage);
}
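A minimal sketch (editorial, not part of the commit) of the Has/Get attribute lookup used above, with a toy pass class in place of the real pass registry; the attribute storage details are assumptions:

#include <iostream>
#include <map>
#include <memory>
#include <string>

class ToyPass {
 public:
  // Takes ownership of value, mirroring the Set(name, new bool(...)) style.
  void Set(const std::string& name, bool* value) { attrs_[name].reset(value); }
  bool Has(const std::string& name) const { return attrs_.count(name) > 0; }
  bool Get(const std::string& name) const { return *attrs_.at(name); }

  void Apply() const {
    bool is_inference_stage = false;  // default when the attribute is absent
    if (Has("is_inference_stage")) {
      is_inference_stage = Get("is_inference_stage");
    }
    std::cout << "is_inference_stage = " << is_inference_stage << "\n";
  }

 private:
  std::map<std::string, std::unique_ptr<bool>> attrs_;
};

int main() {
  ToyPass pass;
  pass.Apply();                                    // attribute not set -> false
  pass.Set("is_inference_stage", new bool(true));
  pass.Apply();                                    // set by the inference path -> true
  return 0;
}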

} // namespace paddle2cinn
} // namespace framework
10 changes: 5 additions & 5 deletions paddle/fluid/imperative/gradient_accumulator.cc
@@ -644,11 +644,11 @@ void GradientAccumulator::CallGradientHooks() {
true,
platform::errors::PreconditionNotMet(
"Only can call gradient hooks after sum gradient completed."));
PADDLE_ENFORCE_EQ(
HasInnerVar(),
true,
platform::errors::PreconditionNotMet(
"Leaf Tensor's inner var is nullptr when call gradient hook."));
PADDLE_ENFORCE_EQ(HasInnerVar(),
true,
platform::errors::PreconditionNotMet(
"Leaf Tensor's inner var is nullptr when "
"call gradient hook."));
PADDLE_ENFORCE_EQ(
inner_var_->Var().IsInitialized(),
true,
3 changes: 3 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
@@ -368,6 +368,9 @@ struct Argument {
DECL_ARGUMENT_FIELD(enable_gpu_half, EnableGPUHalf, bool);
DECL_ARGUMENT_FIELD(mixed_precision_mode, MixedPrecisionMode, int);

// cinn compiler related
DECL_ARGUMENT_FIELD(use_cinn_compiler, UseCinnCompiler, bool);

private:
std::unordered_set<std::string> valid_fields_;
};
2 changes: 2 additions & 0 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -235,6 +235,8 @@ void IRPassManager::CreatePasses(Argument *argument,
new framework::ProgramDesc *(&argument->main_program()));
} else if (pass_name == "memory_optimize_pass") {
pass->Set("root_predictor_id", new int(argument->root_predictor_id()));
} else if (pass_name == "build_cinn_pass") {
pass->Set("is_inference_stage", new bool(argument->use_cinn_compiler()));
}
if (pass_name == "lite_subgraph_pass") {
bool lite_enable_int8 =
@@ -23,6 +23,8 @@ namespace inference {
namespace analysis {

void IrGraphToProgramPass::RunImpl(Argument *argument) {
auto cache_pass =
framework::ir::PassRegistry::Instance().Get("runtime_context_cache_pass");
auto pass =
framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");

@@ -31,14 +33,12 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
new int(argument->memory_optim_sort_kind()));
}

std::unique_ptr<framework::ir::Graph> graph(argument->main_graph_ptr());

// Directly using ProgramDesc desc(argument->main_program()) may cause
// an incomplete copy of the information.
framework::ProgramDesc desc;
desc.CopyFrom(*argument->main_program().Proto());
pass->SetNotOwned("program", &desc);
pass->Apply(graph.release()); // the argument still own the graph.
pass->Apply(cache_pass->Apply(argument->main_graph_ptr()));

argument->SetIrAnalyzedProgram(
new framework::proto::ProgramDesc(*desc.Proto()));
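For intuition only (editorial, not part of the diff), the nested call pass->Apply(cache_pass->Apply(...)) above relies on Apply returning the graph it was handed; a toy version of that chaining, with made-up class names:

#include <iostream>
#include <string>
#include <utility>

struct ToyGraph {
  int num_nodes = 0;
};

class ToyPass {
 public:
  explicit ToyPass(std::string name) : name_(std::move(name)) {}
  ToyGraph* Apply(ToyGraph* graph) const {
    std::cout << "applying " << name_ << "\n";
    return graph;  // hand the same graph to the next pass
  }

 private:
  std::string name_;
};

int main() {
  ToyGraph g;
  ToyPass cache_pass("runtime_context_cache_pass");
  ToyPass to_program_pass("graph_to_program_pass");
  // The inner pass runs first, then its result feeds the outer pass.
  to_program_pass.Apply(cache_pass.Apply(&g));
  return 0;
}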
