This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

add popc op #1064
Merged: 1 commit, merged on Nov 23, 2022
1 change: 1 addition & 0 deletions cinn/frontend/net_builder.cc
@@ -140,6 +140,7 @@ NETBUILDER_UNARY_OP_DEF(Negative, negative)
NETBUILDER_UNARY_OP_DEF(Sign, sign)
NETBUILDER_UNARY_OP_DEF(Abs, abs)
NETBUILDER_UNARY_OP_DEF(Clz, clz)
NETBUILDER_UNARY_OP_DEF(Popc, popc)

#undef NETBUILDER_UNARY_OP_DEF

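For context, NETBUILDER_UNARY_OP_DEF presumably expands each (MethodName, op_type) pair into a one-argument NetBuilder method that forwards to the generic unary-op path; a rough sketch of the expansion for the new entry (an assumption about the macro, not its verbatim definition):

// Hypothetical expansion of NETBUILDER_UNARY_OP_DEF(Popc, popc):
Variable NetBuilder::Popc(const Variable& operand) {
  return UnaryOp("popc", operand);  // assumed generic helper that emits a single "popc" instruction
}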
3 changes: 2 additions & 1 deletion cinn/frontend/net_builder.h
@@ -72,7 +72,8 @@ namespace frontend {
macro__(Negative) \
macro__(Sign) \
macro__(Abs) \
-  macro__(Clz)
+  macro__(Clz) \
+  macro__(Popc)

// ******************************************* //
// The op has two input and one output, with a attribute [axis]
2 changes: 2 additions & 0 deletions cinn/hlir/op/contrib/CMakeLists.txt
@@ -17,6 +17,7 @@ gather_srcs(cinnapi_src SRCS
lookup_table.cc
one_hot.cc
clz.cc
popc.cc
)

cc_test(test_cast SRCS cast_test.cc DEPS cinncore)
@@ -34,3 +35,4 @@ cc_test(test_repeat SRCS repeat_test.cc DEPS cinncore)
cc_test(test_one_hot SRCS one_hot_test.cc DEPS cinncore)
cc_test(test_lookup_table SRCS lookup_table_test.cc DEPS cinncore)
cc_test(test_clz SRCS clz_test.cc DEPS cinncore)
cc_test(test_popc SRCS popc_test.cc DEPS cinncore)
143 changes: 143 additions & 0 deletions cinn/hlir/op/contrib/popc.cc
@@ -0,0 +1,143 @@
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "cinn/common/cas.h"
#include "cinn/common/common.h"
#include "cinn/common/context.h"
#include "cinn/common/macros.h"
#include "cinn/common/target.h"
#include "cinn/hlir/framework/node.h"
#include "cinn/hlir/framework/op.h"
#include "cinn/hlir/framework/op_strategy.h"
#include "cinn/hlir/op/contrib/clip.h"
#include "cinn/hlir/pe/ir_schedule_pe.h"
#include "cinn/hlir/pe/nn.h"
#include "cinn/hlir/pe/schedule.h"
#include "cinn/ir/ir.h"
#include "cinn/ir/ir_base.h"
#include "cinn/ir/ir_operators.h"
#include "cinn/ir/tensor.h"
#include "cinn/lang/builtin.h"
#include "cinn/lang/compute.h"
#include "gflags/gflags.h"

DECLARE_bool(cinn_ir_schedule);

namespace cinn {
namespace hlir {
namespace op {

using common::_CINNValuePack_;
using common::CINNValue;
using common::CINNValuePack;
using framework::OpStrategy;
using framework::shape_t;
using framework::StrategyFunction;

ir::Tensor Popc(const ir::Tensor &input, const Target &target, const std::string &output_name) {
  std::string extern_func = "cinn_";
  if (target == common::DefaultHostTarget()) {
    extern_func += "host_";
  } else if (target == common::DefaultNVGPUTarget()) {
    extern_func += "nvgpu_";
  } else {
    CINN_NOT_IMPLEMENTED
  }

  extern_func += "popc";

  if (input->type().is_int(32) || input->type().is_uint(32)) {
    extern_func += "_int32";
  } else if (input->type().is_int(64) || input->type().is_uint(64)) {
    extern_func += "_int64";
  } else {
    CINN_NOT_IMPLEMENTED
  }

  return Compute(
      input->shape,
      [=](const std::vector<Expr> &indices) {
        Expr e = input(indices);
        return lang::CallExtern(extern_func, {e});
      },
      output_name);
}
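The helper above only assembles an extern-call name from the target and element width — cinn_host_popc_int32 / _int64 on CPU and cinn_nvgpu_popc_int32 / _int64 on GPU, matching the runtime registrations further down — and wraps it in an element-wise Compute. A rough sketch of what that lowers to for a 10-element int32 host input (an approximation for illustration, not verbatim codegen output):

// Approximate shape of the lowered element-wise loop (assumption):
// for (int i = 0; i < 10; ++i) {
//   test_popc[i] = cinn_host_popc_int32(in[i]);
// }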

std::shared_ptr<OpStrategy> StrategyForPopc(const framework::NodeAttr &attrs,
                                            const std::vector<ir::Tensor> &inputs,
                                            const std::vector<Type> &out_type,
                                            const std::vector<std::vector<int>> &output_shapes,
                                            const Target &target) {
  std::string op_name("popc");

  framework::CINNCompute popc_compute([=](lang::Args args, lang::RetValue *ret) {
    CHECK(!args.empty()) << "The input argument of " << op_name << " compute is empty! Please check.\n";
    CINNValuePack pack_args = args[0];
    CHECK(!pack_args.empty()) << "at least one input tensor for " << op_name << " compute\n";

    std::string tensor_name = UniqName(op_name + "_Out");
    if (FLAGS_cinn_ir_schedule) {
      CHECK_EQ(pack_args.size(), 2);
      CHECK(pack_args[1].is_string());
      tensor_name = pack_args[1].operator std::string();
    }

    Expr A_expr = pack_args[0];
    CHECK(A_expr.as_tensor());
    ir::Tensor A = A_expr.as_tensor_ref();
    auto out = Popc(A, target, tensor_name);
    auto stages = CreateStages({out});
    *ret = CINNValuePack{{CINNValue(Expr(out.get())), CINNValue(stages)}};
  });

  auto strategy = std::make_shared<framework::OpStrategy>();
  strategy->AddImpl(popc_compute, framework::GetInjectiveScheduleFunc(output_shapes, target), "strategy.popc.x86", 1);
  return strategy;
}

std::vector<framework::shape_t> InferShapeForPopc(const std::vector<framework::shape_t> &inputs_shape,
                                                  const framework::AttrMapType &attrs) {
  CHECK(!inputs_shape.empty() && !inputs_shape[0].empty()) << "The input's shape size is 0! Please check again.";
  std::vector<framework::shape_t> res{inputs_shape[0]};
  return res;
}

std::vector<Type> InferDtypeForPopc(const std::vector<Type> &inputs_type, const framework::AttrMapType &attrs) {
  CHECK(!inputs_type.empty()) << "The input's type size is 0! Please check again.";
  std::vector<Type> res{inputs_type[0]};
  return res;
}

} // namespace op
} // namespace hlir
} // namespace cinn

CINN_REGISTER_HELPER(popc_ops) {
  CINN_REGISTER_OP(popc)
      .describe("Population count.")
      .set_num_inputs(1)
      .set_num_outputs(1)
      .set_attr<cinn::hlir::framework::StrategyFunction>("CINNStrategy", cinn::hlir::op::StrategyForPopc)
      .set_attr("infershape", MakeOpFunction(cinn::hlir::op::InferShapeForPopc))
      .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForPopc))
      .set_attr<cinn::hlir::framework::OpPatternKind>("OpPattern", cinn::hlir::framework::OpPatternKind::kElementWise)
      .set_support_level(4);

  return true;
}
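With the op registered and the Popc entry added to the NetBuilder macro list above, end-to-end use should mirror the existing unary ops such as Clz. A minimal sketch (the CreateInput signature and variable names are assumptions based on the surrounding CINN frontend, not part of this PR):

frontend::NetBuilder builder("popc_demo");
auto x   = builder.CreateInput(common::Int(32), {10}, "x");  // hypothetical 10-element int32 input
auto out = builder.Popc(x);                                  // element-wise population count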
32 changes: 32 additions & 0 deletions cinn/hlir/op/contrib/popc.h
@@ -0,0 +1,32 @@
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <string>
#include <vector>

#include "cinn/ir/ir.h"
#include "cinn/ir/ir_base.h"
#include "cinn/ir/tensor.h"

namespace cinn {
namespace hlir {
namespace op {

ir::Tensor Popc(const ir::Tensor& input, const Target& target, const std::string& output_name = "T_Popc_out");

} // namespace op
} // namespace hlir
} // namespace cinn
99 changes: 99 additions & 0 deletions cinn/hlir/op/contrib/popc_test.cc
@@ -0,0 +1,99 @@
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cinn/hlir/op/contrib/popc.h"

#include <glog/logging.h>
#include <gtest/gtest.h>

#include <string>
#include <vector>

#include "cinn/backends/codegen_c.h"
#include "cinn/backends/codegen_c_x86.h"
#include "cinn/backends/codegen_cuda_dev.h"
#include "cinn/common/context.h"
#include "cinn/lang/lower.h"
#include "cinn/lang/placeholder.h"
#include "cinn/poly/stage.h"

namespace cinn {
namespace hlir {
namespace op {
namespace {
bool IsCompiledWithCUDA() {
#if !defined(CINN_WITH_CUDA)
  return false;
#else
  return true;
#endif
}
} // namespace

TEST(GenerateCode_Cpu, Popc) {
  common::Context::Global().ResetNameId();

  common::Target target = common::DefaultHostTarget();
  lang::Placeholder<int> in("in", std::vector<int>{10});
  ir::Tensor res = Popc(in, target, "test_popc");

  poly::StageMap stages = poly::CreateStages({res});
  std::vector<ir::LoweredFunc> funcs =
      lang::LowerVec("TestGenerateCodeCpu_Popc", stages, {res}, {}, {}, nullptr, target, true);

  VLOG(6) << "Expr before CPU codegen:";
  VLOG(6) << funcs[0]->body;

  ir::Module::Builder builder("Popc_Module", target);
  for (auto& f : funcs) {
    builder.AddFunction(f);
  }

  backends::CodeGenCX86 codegen(target, backends::CodeGenCX86::Feature::AVX512);
  codegen.SetInlineBuiltinCodes(false);
  std::string code = codegen.Compile(builder.Build(), backends::CodeGenC::OutputKind::CImpl);
  VLOG(6) << "Cpu Codegen result:";
  VLOG(6) << code;
}

TEST(GenerateCode_Cuda, Popc) {
  if (!IsCompiledWithCUDA()) {
    return;
  }
  common::Context::Global().ResetNameId();

  common::Target target = common::DefaultNVGPUTarget();

  lang::Placeholder<int64_t> in("in", std::vector<int>{10});
  ir::Tensor res = Popc(in, target, "test_popc");

  poly::StageMap stages = poly::CreateStages({res});
  stages[res]->Bind(0, "blockIdx.x");
  stages[res]->SetBuffer("global");

  std::vector<ir::LoweredFunc> funcs =
      lang::LowerVec("TestGenerateCodeCuda_Popc", stages, {res}, {}, {}, nullptr, target, true);

  VLOG(6) << "Expr before CUDA codegen:";
  VLOG(6) << funcs[0]->body;

  ir::Module::Builder builder("Popc_Module", target);
  for (auto& f : funcs) {
    builder.AddFunction(f);
  }
}

} // namespace op
} // namespace hlir
} // namespace cinn
1 change: 1 addition & 0 deletions cinn/hlir/op/use_ops.h
@@ -39,3 +39,4 @@ CINN_USE_REGISTER(repeat_ops)
CINN_USE_REGISTER(one_hot_ops)
CINN_USE_REGISTER(lookup_table_ops)
CINN_USE_REGISTER(clz_ops)
CINN_USE_REGISTER(popc_ops)
3 changes: 2 additions & 1 deletion cinn/pybind/frontend.cc
@@ -621,7 +621,8 @@ void BindFrontend(pybind11::module *m) {
.def("arange", &NetBuilder::Arange, py::arg("start"), py::arg("end"), py::arg("step"), py::arg("dtype"))
.def("gather", &NetBuilder::Gather, py::arg("x"), py::arg("index"), py::arg("axis"))
.def("gather_nd", &NetBuilder::GatherNd, py::arg("x"), py::arg("index"), py::arg("axes"))
.def("clz", &NetBuilder::Clz, py::arg("x"));
.def("clz", &NetBuilder::Clz, py::arg("x"))
.def("popc", &NetBuilder::Popc, py::arg("x"));

auto computation = py::class_<CinnComputation, std::shared_ptr<CinnComputation>>(*m, "Computation");
py::class_<CinnComputation::CompileOptions>(computation, "CompileOptions")
8 changes: 8 additions & 0 deletions cinn/runtime/cpu/host_intrinsics.cc
@@ -128,12 +128,16 @@ inline int FN_INT32(pow)(int x, int y) {

inline int FN_INT32(clz)(int x) { return __builtin_clz(x); }

inline int FN_INT32(popc)(int x) { return __builtin_popcount(x); }

#undef FN_INT32

#define FN_INT64(func) cinn_host_##func##_int64

inline int64_t FN_INT64(clz)(int64_t x) { return __builtin_clzll(x); }

inline int64_t FN_INT64(popc)(int64_t x) { return __builtin_popcountll(x); }

#undef FN_INT64
}
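The host wrappers simply defer to the compiler's bit-count builtins, so popc(x) is the number of set bits in x. A standalone sanity check of those builtins (assuming a GCC/Clang toolchain; not part of this diff):

#include <cassert>
int main() {
  assert(__builtin_popcount(11) == 3);                // 11 = 0b1011 -> three set bits
  assert(__builtin_popcountll(0xFFFFFFFFULL) == 32);  // lower 32 bits set
  return 0;
}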

@@ -190,6 +194,10 @@ CINN_REGISTER_HELPER(host_intrinsics) {

REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_clz_int64, host_target, int64_t, int64_t);

REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_popc_int32, host_target, int, int);

REGISTER_EXTERN_FUNC_1_IN_1_OUT(cinn_host_popc_int64, host_target, int64_t, int64_t);

REGISTER_EXTERN_FUNC_HELPER(cinn_host_find_int, host_target)
.SetRetType<int>()
.AddInputType<cinn_buffer_t*>()
4 changes: 4 additions & 0 deletions cinn/runtime/cpu/host_intrinsics.h
@@ -51,11 +51,15 @@ inline int FN_INT32(pow)(int x, int y);

inline int FN_INT32(clz)(int x);

inline int FN_INT32(popc)(int x);

#undef FN_INT32

#define FN_INT64(func) cinn_host_##func##_int64

inline int64_t FN_INT64(clz)(int64_t x);

inline int64_t FN_INT64(popc)(int64_t x);

#undef FN_INT64
}
2 changes: 2 additions & 0 deletions cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -68,13 +68,15 @@ __device__ inline int FN_INT32(bitwise_or)(int a, int b) { return a | b; }
__device__ inline int FN_INT32(bitwise_xor)(int a, int b) { return a ^ b; }
__device__ inline int FN_INT32(bitwise_not)(int a) { return ~a; }
__device__ inline int FN_INT32(clz)(int a) { return __clz(a); }
__device__ inline int FN_INT32(popc)(int a) { return __popc(a); }


// *************************************************************** //
// int64 unary and binary operator
#define FN_INT64(func) cinn_nvgpu_##func##_int64

__device__ inline long long int FN_INT64(clz)(long long int a) { return __clzll(a); }
__device__ inline long long int FN_INT64(popc)(long long int a) { return __popcll(a); }


// *************************************************************** //
2 changes: 2 additions & 0 deletions cinn/runtime/cuda/cuda_intrinsics.cc
@@ -91,13 +91,15 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {

REGISTER_EXTERN_FUNC_1_IN_1_INT32(bitwise_not)
REGISTER_EXTERN_FUNC_1_IN_1_INT32(clz)
REGISTER_EXTERN_FUNC_1_IN_1_INT32(popc)

#undef REGISTER_EXTERN_FUNC_1_IN_1_INT32

#define REGISTER_EXTERN_FUNC_1_IN_1_INT64(func__) \
REGISTER_EXTERN_SOURCE_FUNC_1_IN_1_OUT(cinn_nvgpu_##func__##_int64, target, int64_t, int64_t);

REGISTER_EXTERN_FUNC_1_IN_1_INT64(clz)
REGISTER_EXTERN_FUNC_1_IN_1_INT64(popc)

#undef REGISTER_EXTERN_FUNC_1_IN_1_INT64
