[BugFix]fix bug of opt tool #8590

Merged 1 commit on Mar 14, 2022
21 changes: 14 additions & 7 deletions lite/core/optimizer/mir/static_kernel_pick_pass.cc
@@ -41,10 +41,14 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
CHECK(graph) << "graph not valid";

// sort kernels by the factors.
VLOG(4) << "graph->mutable_nodes().size():" << graph->mutable_nodes().size();
VLOG(2) << "graph block_idx: " << graph->blockIdx();
VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size();
size_t idx = 0;
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
auto& instruct = node.AsStmt();
VLOG(2) << "pick kernel for op : " << instruct.op_type() << ", in block "
<< graph->blockIdx() << ", idx : " << idx++;

std::map<std::string, PrecisionType> in_types;
std::map<std::string, PrecisionType> out_types;
@@ -66,17 +70,19 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::pair<float, std::unique_ptr<KernelBase>>> scored;
CHECK(!instruct.kernels().empty()) << "No kernels found for "
<< instruct.op_type();
VLOG(4) << "instruct.kernels().size():" << instruct.kernels().size();

VLOG(2) << "candidate kernels size:" << instruct.kernels().size();
for (auto&& kernel : instruct.kernels()) {
float score = KernelGrade(instruct,
VLOG(2) << "current candidate kernel is: " << kernel->summary();
VLOG(2) << "valid_places size is: " << graph->valid_places().size();
float score = KernelGrade(&node,
*kernel,
graph->valid_places(),
in_types,
out_types,
instruct.op_info()->input_names(),
instruct.op_info()->output_names());
VLOG(4) << "kernel->summary():" << kernel->summary()
<< " score:" << score;

scored.emplace_back(score, std::move(kernel));
}
std::stable_sort(scored.begin(), scored.end(), KernelScoreCmp);
@@ -87,7 +93,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Just keep a single best kernel.
// TODO(Superjomn) reconsider this.
instruct.kernels().emplace_back(std::move(scored.front().second));
VLOG(2) << "pick " << instruct.kernels().front()->summary() << "\n\n";
VLOG(2) << "the final pick kernel is "
<< instruct.kernels().front()->summary() << "\n\n";

} else {
bool out_type_int8 = true;
@@ -137,7 +144,7 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
instruct.ResetOp(update_desc, graph->valid_places());
scored.clear();
for (auto&& kernel : instruct.kernels()) {
float score = KernelGrade(instruct,
float score = KernelGrade(&node,
*kernel,
graph->valid_places(),
in_types,
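Both call sites in this file now pass the graph node to KernelGrade instead of the bare statement. A minimal sketch of why (type shapes simplified and hypothetical, not the real headers): a Stmt only describes the op and its candidate kernels, while the Node also carries graph connectivity, which the new datatype rule in the header below walks via node->inlinks.

```cpp
#include <string>
#include <vector>

struct Stmt { /* op_info(), kernels(), ... as before */ };

struct Node {
  std::vector<Node*> inlinks;  // argument nodes feeding this statement
  Stmt stmt;                   // what KernelGrade received before this PR
  Stmt& AsStmt() { return stmt; }
};
```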
67 changes: 56 additions & 11 deletions lite/core/optimizer/mir/static_kernel_pick_pass.h
@@ -50,13 +50,14 @@ class StaticKernelPickPass : public mir::StmtPass {

private:
// Score the kernel.
size_t KernelGrade(const lite::mir::Node::Stmt& instruct,
size_t KernelGrade(lite::mir::Node* node,
const lite::KernelBase& kernel,
const std::vector<Place>& places,
const std::map<std::string, PrecisionType>& in_types,
const std::map<std::string, PrecisionType>& out_types,
const std::vector<std::string>& in_names,
const std::vector<std::string>& out_names) {
const auto& instruct = node->AsStmt();
CHECK_GT(places.size(), static_cast<size_t>(0)) << "valid_places is empty.";
float final_score{-1.};
Place winner_place{places[0]};
@@ -76,14 +77,19 @@
for (size_t i = 0; i < place_size; ++i) {
const auto& place = places[i];
float weight = static_cast<float>(place_size - i) / place_size;
VLOG(4) << "current place is " << place.DebugString() << ", idx : " << i
<< ", weight : " << weight;
size_t score{};

// The more important factor comes first
if (kernel_pick_factors_.IsTargetConsidered() &&
(place.target == kernel.target() || kernel.target() == TARGET(kAny) ||
place.target == TARGET(kAny))) {
score += kMax /
static_cast<int>(core::KernelPickFactor::Factor::TargetFirst);
size_t target_score =
kMax /
static_cast<int>(core::KernelPickFactor::Factor::TargetFirst);
score += target_score;
VLOG(4) << "[TargetConsidered score]:" << target_score;
}
VLOG(4) << "[score s1]:" << score;
if (kernel_pick_factors_.IsPrecisionConsidered() &&
@@ -93,17 +99,23 @@
// Skip the score if the kernel is int8 but the op is not int8.
if (!(kernel.precision() == PRECISION(kInt8) &&
!instruct.op_info()->HasAttr("enable_int8"))) {
score += kMax / static_cast<int>(
core::KernelPickFactor::Factor::PrecisionFirst);
size_t precision_score =
kMax /
static_cast<int>(core::KernelPickFactor::Factor::PrecisionFirst);
score += precision_score;
VLOG(4) << "[PrecisionConsidered score]:" << precision_score;
}
}
VLOG(4) << "[score s2]:" << score;
if (kernel_pick_factors_.IsDataLayoutConsidered() &&
(place.layout == kernel.layout() ||
kernel.layout() == DATALAYOUT(kAny) ||
place.layout == DATALAYOUT(kAny))) {
score += kMax / static_cast<int>(
core::KernelPickFactor::Factor::DataLayoutFirst);
size_t datalayout_score =
kMax /
static_cast<int>(core::KernelPickFactor::Factor::DataLayoutFirst);
score += datalayout_score;
VLOG(4) << "[DataLayoutConsidered score]:" << datalayout_score;
}
VLOG(4) << "[score s3]:" << score;

@@ -138,10 +150,44 @@
}
if (type_match) {
score *= 2;
VLOG(4) << "[Input precision compatible]: *2";
}
VLOG(4) << "[score s4]:" << score;
}

// New rule for datatypes: when the input datatypes are consistent with
// the kernel's declared input types, prefer that kernel.
if (instruct.op_info()->Type() != "conditional_block" &&
instruct.op_info()->Type() != "while" &&
instruct.op_info()->Type() != "subgraph") {
bool datatype_match = true;
for (auto* in : node->inlinks) {
if (!in->IsArg()) continue;
if (in->AsArg().name == "feed" || in->AsArg().is_persist) continue;
std::string argname;
instruct.op_info()->GetInputArgname(in->AsArg().name, &argname);
VLOG(5) << "intput var name : " << in->AsArg().name;
// The type pointer is non-null only when the datatype is LOD_TENSOR,
// LOD_TENSOR_ARRAY, or STEP_SCOPES.
if (in->AsArg().type) {
VLOG(5) << "input datatype : "
<< static_cast<int>(in->AsArg().type->id());
VLOG(5) << "kernel bind datatype : "
<< static_cast<int>(kernel.GetInputDeclType(argname)->id());
Review comment (Collaborator):
This part of the implementation is risky, e.g.:

op {float in1, int in2, float out}
place <CPU\NCHW\float32>
Candidate kernels:
Kernel1 <CPU\NCHW\float32> {float in1, float in2, float out}
Kernel2 <CPU\NCHW\float32> {float in1, int in2, float out}

Kernel2 is the better match, yet Kernel1 is actually selected.
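For illustration only (not part of this PR, all names invented), a per-input variant of the rule that would rank the reviewer's example as expected: count how many input datatypes match instead of granting a single all-or-nothing *2 bonus.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Count per-input datatype matches. In the example above, Kernel2 scores
// 2/2 and Kernel1 only 1/2, so Kernel2 outranks Kernel1.
size_t DatatypeMatchCount(const std::vector<int>& op_input_type_ids,
                          const std::vector<int>& kernel_decl_type_ids) {
  size_t matches = 0;
  const size_t n =
      std::min(op_input_type_ids.size(), kernel_decl_type_ids.size());
  for (size_t i = 0; i < n; ++i) {
    if (op_input_type_ids[i] == kernel_decl_type_ids[i]) ++matches;
  }
  return matches;  // e.g. scale the score by (1.0 + matches / n)
}
```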

if (static_cast<int>(in->AsArg().type->id()) !=
static_cast<int>(kernel.GetInputDeclType(argname)->id()))
datatype_match = false;
} else {
datatype_match = false;
}
}
if (datatype_match) {
score *= 2;
VLOG(4) << "[Input datatype compatible]: *2";
}
VLOG(4) << "[score s5]:" << score;
}

if (weight * score > final_score) {
final_score = weight * score;
winner_place = place;
@@ -191,9 +237,8 @@ }
}
}

VLOG(4) << "[score(final)]:" << final_score;
VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------";
VLOG(2) << "-------- score summary for candidate kernel : "
<< kernel.summary() << " --------";
VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target);
Expand All @@ -203,8 +248,8 @@ class StaticKernelPickPass : public mir::StmtPass {
<< TargetToStr(kernel.place().target);
VLOG(4) << "kernel.op_type():" << kernel.op_type();
VLOG(4) << "kernel picker factors:" << kernel_pick_factors_;
VLOG(4) << "kernel place:" << kernel.place().DebugString();
VLOG(4) << "winner_picker place:" << winner_place.DebugString();
VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "------------------------------";

// The data layout is not considered, for the input and output arguments
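Taken together, the hunks above instrument an additive scheme. A compact, self-contained sketch of how the logged s1..s5 values combine; the factor ranks are assumed for illustration, the real ones live in core::KernelPickFactor:

```cpp
#include <cstdint>
#include <iostream>

// Assumed ranks mirroring the priority order: a smaller divisor yields a
// larger contribution, so target > precision > data layout.
enum class Factor : int { TargetFirst = 1, PrecisionFirst = 2, DataLayoutFirst = 4 };
constexpr int64_t kMax = 1000000000;  // stand-in for the pass's kMax

int main() {
  int64_t score = 0;
  score += kMax / static_cast<int>(Factor::TargetFirst);      // [score s1]
  score += kMax / static_cast<int>(Factor::PrecisionFirst);   // [score s2]
  score += kMax / static_cast<int>(Factor::DataLayoutFirst);  // [score s3]
  score *= 2;  // [score s4] input precisions compatible
  score *= 2;  // [score s5] input datatypes compatible (the rule added here)
  // The winner is chosen by weight * score across places, where
  // weight = (place_size - i) / place_size decays with the place index.
  std::cout << "raw score: " << score << "\n";
}
```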
2 changes: 2 additions & 0 deletions lite/core/program.cc
@@ -606,6 +606,8 @@ void Program::PrepareWorkspace(
} else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) {
var_type_map_[var_name] = LiteType::GetTensorListTy(
TARGET(kUnk), PRECISION(kUnk), DATALAYOUT(kUnk));
auto* tensor_array = var->GetMutable<std::vector<lite::Tensor>>();
tensor_array->resize(0);
} else if (var_type == lite::VarDescAPI::Type::STEP_SCOPES) {
var->GetMutable<std::vector<lite::Scope*>>();
}
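The two added lines reset every LOD_TENSOR_ARRAY variable to an empty vector while the workspace is prepared. A short sketch of the pattern this enforces (helper and variable name hypothetical, not from this PR): a scope reused across runs would otherwise hand ops an array still holding elements appended by the previous run.

```cpp
#include <string>
#include <vector>

#include "lite/core/scope.h"
#include "lite/core/tensor.h"

// Hypothetical helper: reset one LOD_TENSOR_ARRAY variable so a reused
// scope starts the next run with an empty array.
void ClearTensorArray(lite::Scope* scope, const std::string& var_name) {
  auto* tensor_array =
      scope->Var(var_name)->GetMutable<std::vector<lite::Tensor>>();
  tensor_array->resize(0);  // ops in this run append from scratch
}
```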
58 changes: 15 additions & 43 deletions lite/core/type_system.h
@@ -60,15 +60,11 @@ namespace lite {
// We use Types to declare the definition of a kernel, each inputs' and outputs'
// arguments have a specific Types.
//
// REGISTER_LITE_KERNEL(mul, kHost, kFloat,
// paddle::lite::kernels::host::MulCompute, def)
// .BindInput("X", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
// TARGET(kHost))})
// .BindInput("Y", {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(
// TARGET(kHost))})
// .BindOutput("Out",
// {paddle::lite::Type::Get<paddle::lite::TensorFp32NCHWTy>(TARGET(kHost))})
// .Finalize();
// REGISTER_LITE_KERNEL(mul, kARM, kInt8, kNCHW, Mul_int8_f32, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
// .Finalize();
//
// The above definition will be used in MIR by type inference and
// incompatible-type checks.
@@ -116,13 +112,13 @@ class DataType {
};

/*
* Datatype with device info considered.
* NOTE A Type with different device is treated as different DeviceDataType.
* Datatype with Place info considered.
* NOTE A Type with different Place info is treated as different Type.
*/
class Type : public DataType {
public:
// Can cast to another type. This is heavily used in MIR, by determine whether
// is is possible to add a statement to transform a type to another.
// it is possible to add a statement to transform a type to another.
virtual bool TypeCastable(const Type& type) const { return id_ == type.id(); }

/// Get a Tensor type.
@@ -258,30 +254,6 @@ struct ParamType {
std::string DebugString() const { return type->name(); }
};

/*
* The data types of kernel parameters. It is used to track the type of kernel's
* inputs and outputs.
*/
struct ParamTypeRecorder {
std::map<std::string, ParamType> inputs;
std::map<std::string, ParamType> outputs;

void RegisterInputType(const std::string& arg_name, const ParamType& type) {
Register(&inputs, arg_name, type);
}

void RegisterOutputType(const std::string& arg_name, const ParamType& type) {
Register(&outputs, arg_name, type);
}

private:
void Register(std::map<std::string, ParamType>* ts,
const std::string& arg_name,
ParamType type) {
(*ts)[arg_name] = type;
}
};

/*
* The ParamTypeRegistry help register the input and output data types for all
* the kernels. It is made singleton so that all the objects of the same kernel
@@ -296,19 +268,19 @@ struct ParamTypeRecorder {
class ParamTypeRegistry {
public:
enum class IO : int { kInvalid = 0, kInput, kOutput };

template <TargetType target,
PrecisionType precision,
DataLayoutType layout = DataLayoutType::kNCHW>
/*
* Helper class for registering a ParamType for a Kernel.
* Usage:
*
* NewInstance<TARGET(kHost), PRECISION(kFloat)>("fc")
* .BindInput(0, {typeid(Tensor).hash_code(), {TARGET(kHost)})
* .BindInput(1, {typeid(Tensor).hash_code(), {TARGET(kHost),
* PRECISION(kFloat)});
* .BindInput("Input_0", {Type::GetTensorTy(TARGET(kHost),
* PRECISION(kInt64))})
* .BindInput("Input_1", {Type::GetTensorTy(TARGET(kHost),
* PRECISION(kInt64))});
*/
template <TargetType target,
PrecisionType precision,
DataLayoutType layout = DataLayoutType::kNCHW>
struct NewInstance {
explicit NewInstance(const std::string& kernel_type)
: kernel_type_(kernel_type) {}