Dev transformer #1723

Open
wants to merge 28 commits into master
Changes from all commits
47d318f
Add README_EVA.md documentation file
May 12, 2022
28610c0
Fix the ONNX2TNN conversion failure caused by the GEMM operator
May 12, 2022
90e90c5
Fix the 2TNN misalignment issue caused by the Selu operator's default value
May 12, 2022
62cff44
Add support for Transformer task requirements; related changes have been posted to iwiki https://iwiki.woa.com/pages/view…
Jul 6, 2022
16e2fae
return error when mat.data_ == nullptr (#1733)
bluaxe Jul 19, 2022
4e00824
[CUDA][ADD] add logsoftmax operator & unittest (#1728)
sjfeng1999 Jul 19, 2022
09c73e1
[UPD]fix coreml error; support swish and groupnorm op (#1738)
darrenyao87 Jul 23, 2022
dc81d63
Dev groupnorm (#1739)
quinnrong94 Jul 26, 2022
17fb9dc
Delete README_EVA file.
bluaxe Aug 2, 2022
3d1d70c
refine PR 1723 comments
bluaxe Aug 2, 2022
09ec860
fix onnx converter
bluaxe Aug 2, 2022
d2819e9
refine comments for PR 1723
bluaxe Aug 2, 2022
21528dc
bugfix: in CPU mode, the layer_res->element_shape value was not passed correctly, so binary operators used only the first element in the computation
Aug 2, 2022
3e88ced
Add support for Transformer task requirements; related changes have been posted to iwiki https://iwiki.woa.com/pages/view…
Jul 6, 2022
1283bfe
[UPD]fix coreml error; support swish and groupnorm op (#1738)
darrenyao87 Jul 23, 2022
456b51e
Dev groupnorm (#1739)
quinnrong94 Jul 26, 2022
8aa24e4
refine PR 1723 comments
bluaxe Aug 2, 2022
2f67e21
fix onnx converter
bluaxe Aug 2, 2022
1ecee3e
refine comments for PR 1723
bluaxe Aug 2, 2022
729b1b8
Implement INT32 computation support via the Optimizer mechanism
Aug 29, 2022
6acbd67
merge with master
Aug 29, 2022
0ad3828
merge with master part 2
Aug 29, 2022
11fdfb0
Merge branch 'master' into dev_transformer
gttiankai Sep 21, 2022
cc195da
Merge branch 'master' into dev_transformer
bluaxe Dec 7, 2022
abc85ac
Merge branch 'master' into dev_transformer
bluaxe Jan 4, 2023
e08c15e
Merge branch 'master' into dev_transformer
bluaxe Jan 17, 2023
1ed16fd
Merge branch 'master' into dev_transformer
bluaxe Feb 3, 2023
7839397
Merge branch 'master' into dev_transformer
gttiankai Feb 13, 2023
1 change: 1 addition & 0 deletions platforms/ios/tnn.xcodeproj/project.pbxproj
@@ -4832,6 +4832,7 @@
9DAD98952886B3A700170339 /* metal_batch_norm_layer_acc.mm in Sources */,
9DAD94462886B1D400170339 /* scatter_layer.cc in Sources */,
9DAD956A2886B22300170339 /* CONV_DW_3X3S1_INT8_SDOT_SLIDEW.S in Sources */,
369005C5267314D900412264 /* pad_utils.cc in Sources */,
9DAD97842886B39600170339 /* metal_mat_converter.metal in Sources */,
360F96A62966A0B800E3695A /* graph_matcher.cc in Sources */,
9D32FCBE24557EEC002DCDAB /* cpu_utils.cc in Sources */,
2 changes: 1 addition & 1 deletion source/tnn/core/default_network.cc
@@ -573,7 +573,7 @@ Status DefaultNetwork::Forward() {
std::vector<Blob *> outputs = layer->GetOutputBlobs();

{

#if DUMP_INPUT_BLOB
if (runtime_model_ == RUNTIME_MODE_NORMAL) {
// InputBlob data is dumped into files in NCHW_FLOAT format as default
72 changes: 61 additions & 11 deletions source/tnn/device/arm/acc/arm_binary_layer_acc.cc
@@ -108,11 +108,47 @@ Float4 binary_op<ArmBinaryOpType::kHARDSWISH, Float4>(const Float4 &a, const Flo
return a * Float4::max(Float4::min(b * alpha + beta, 1.0f), 0.f);
}

template <>
float binary_op<ArmBinaryOpType::kEQUAL, float>(const float &a, const float &b, float alpha, float beta) {
return a == b ? 1.0f : 0.f;
}
template <>
bfp16_t binary_op<ArmBinaryOpType::kEQUAL, bfp16_t>(const bfp16_t &a, const bfp16_t &b, float alpha, float beta) {
return static_cast<float>(a) == static_cast<float>(b) ? 1.0f : 0.f;
}
template <>
float binary_op<ArmBinaryOpType::kGREATER, float>(const float &a, const float &b, float alpha, float beta) {
return a > b ? 1.0f : 0.f;
}
template <>
bfp16_t binary_op<ArmBinaryOpType::kGREATER, bfp16_t>(const bfp16_t &a, const bfp16_t &b, float alpha, float beta) {
return static_cast<float>(a) > static_cast<float>(b) ? 1.0f : 0.f;
}
template <>
Float4 binary_op<ArmBinaryOpType::kEQUAL, Float4>(const Float4 &a, const Float4 &b, float alpha, float beta) {
Float4 dst;
for (int i = 0; i < 4; ++i) {
dst.value[i] = (a.value[i] == b.value[i] ? 1.0f : 0.f);
}
return dst;
}
template <>
Float4 binary_op<ArmBinaryOpType::kGREATER, Float4>(const Float4 &a, const Float4 &b, float alpha, float beta) {
Float4 dst;
for (int i = 0; i < 4; ++i) {
dst.value[i] = a.value[i] > b.value[i] ? 1.0f : 0.f;
}
return dst;
}

Status ArmBinaryLayerAcc::Init(Context *context, LayerParam *param, LayerResource *resource,
const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {

desc_for_config_const_blob_ = outputs[0]->GetBlobDesc();
RETURN_ON_NEQ(ArmLayerAcc::Init(context, param, resource, inputs, outputs), TNN_OK);
if (outputs[0]->GetBlobDesc().data_type == DATA_TYPE_FLOAT) {

if (outputs[0]->GetBlobDesc().data_type == DATA_TYPE_FLOAT
|| outputs[0]->GetBlobDesc().data_type == DATA_TYPE_INT32) {
RETURN_ON_NEQ(allocateBufferParam(inputs, outputs), TNN_OK);
}
#if TNN_ARM82
@@ -221,7 +257,8 @@ Status ArmBinaryLayerAcc::Reshape(const std::vector<Blob *> &inputs, const std::

// SUPPORTED DATATYPES
bool ArmBinaryLayerAcc::DataTypeSupported(DataType data_type) {
if (data_type == DATA_TYPE_FLOAT || data_type == DATA_TYPE_HALF || data_type == DATA_TYPE_BFP16)
if (data_type == DATA_TYPE_FLOAT || data_type == DATA_TYPE_HALF || data_type == DATA_TYPE_BFP16
|| data_type == DATA_TYPE_INT32)
return true;
else
return false;
@@ -269,7 +306,7 @@ Status ArmBinaryLayerAcc::allocateBufferParam(const std::vector<Blob *> &inputs,
auto data_byte_size = DataTypeUtils::GetBytesSize(element_handle.GetDataType());
auto layer_data = element_handle.force_to<void *>();
if (element_handle.GetDataType() == DATA_TYPE_FLOAT) {
if (layer_res_size == 1) {
if (layer_res_size == 1 || dims_pad.size() == 0 ){
// broadcast single, just memcpy
RawBuffer temp(4 * layer_res_size * data_byte_size);
memcpy(temp.force_to<void *>(), layer_data, layer_res_size * data_byte_size);
@@ -290,6 +327,7 @@ Status ArmBinaryLayerAcc::allocateBufferParam(const std::vector<Blob *> &inputs,
hw_stride = DimsVectorUtils::Count(dims_pad, 2);
}
RawBuffer temp(count * data_byte_size);

DataFormatConverter::ConvertFromNCHWToNCHW4Float(
static_cast<float *>(layer_data), temp.force_to<float *>(), dims_pad[0], channel, hw_stride, 1);
broadcast_ = temp;
@@ -311,6 +349,7 @@ Status ArmBinaryLayerAcc::allocateBufferParam(const std::vector<Blob *> &inputs,
}
}


return TNN_OK;
}

@@ -331,7 +370,6 @@ Status ArmBinaryLayerAcc::Exec(const std::vector<Blob *> &inputs, const std::vec
auto output_ptr = GetBlobHandlePtr(output->GetHandle());
auto input0_ptr = input_ptrs_[0];
auto input1_ptr = input_ptrs_[1];

// input0_shape != output_shape && input1_shape != output_shape -> general impl
if (!DimsVectorUtils::Equal(output_dims, input_shapes_[0]) &&
!DimsVectorUtils::Equal(output_dims, input_shapes_[1])) {
@@ -361,7 +399,6 @@ Status ArmBinaryLayerAcc::Exec(const std::vector<Blob *> &inputs, const std::vec
BinaryFunc<T, op_type>(output_ptr, output_ptr, input_ptr, output_dims, input0_pad_shape, alpha_, beta_);
}
}

return TNN_OK;
}

@@ -412,7 +449,10 @@ Status ArmBinaryLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std
return Exec<float, ArmBinaryOpType::kMIN>(inputs, outputs);
case ArmBinaryOpType::kHARDSWISH:
return Exec<float, ArmBinaryOpType::kHARDSWISH>(inputs, outputs);

case ArmBinaryOpType::kEQUAL:
return Exec<float, ArmBinaryOpType::kEQUAL>(inputs, outputs);
case ArmBinaryOpType::kGREATER:
return Exec<float, ArmBinaryOpType::kGREATER>(inputs, outputs);
default:
LOGE("Error, unknown binary op_type\n");
return TNNERR_LAYER_ERR;
@@ -439,12 +479,22 @@ Status ArmBinaryLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std
return TNNERR_LAYER_ERR;
}
} else if (data_type == DATA_TYPE_INT8) {
if (op_type_ == ArmBinaryOpType::kADD) {
return ExecInt8(inputs, outputs);
} else {
LOGE("Error, int8 binary op only support add\n");
return TNNERR_LAYER_ERR;
switch (op_type_) {
case ArmBinaryOpType::kADD:
return ExecInt8(inputs, outputs);
case ArmBinaryOpType::kEQUAL:
outputs[0]->GetBlobDesc().data_type = DATA_TYPE_FLOAT;
Exec<float, ArmBinaryOpType::kEQUAL>(inputs, outputs);
break;
case ArmBinaryOpType::kGREATER:
outputs[0]->GetBlobDesc().data_type = DATA_TYPE_FLOAT;
Exec<float, ArmBinaryOpType::kGREATER>(inputs, outputs);
break;
default:
LOGE("Error, int8 binary op only support add\n");
return TNNERR_LAYER_ERR;
}
return TNN_OK;
}
#if TNN_ARM82
else if (data_type == DATA_TYPE_HALF) {
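The new kEQUAL / kGREATER specializations materialize each comparison as a float mask of 1.0f / 0.f per lane, and the INT8 branch of DoForward reuses the same float kernels by retyping the output blob to DATA_TYPE_FLOAT before dispatching. A minimal standalone sketch of that lane-wise convention, using a plain std::array in place of TNN's Float4 (Vec4, equal4 and greater4 are illustrative names, not TNN API):

#include <array>
#include <cstdio>

using Vec4 = std::array<float, 4>;  // stand-in for TNN's Float4

// lane-wise EQUAL, same 1.0f / 0.f convention as the new specialization
Vec4 equal4(const Vec4 &a, const Vec4 &b) {
    Vec4 dst{};
    for (int i = 0; i < 4; ++i) {
        dst[i] = (a[i] == b[i]) ? 1.0f : 0.f;
    }
    return dst;
}

// lane-wise GREATER
Vec4 greater4(const Vec4 &a, const Vec4 &b) {
    Vec4 dst{};
    for (int i = 0; i < 4; ++i) {
        dst[i] = (a[i] > b[i]) ? 1.0f : 0.f;
    }
    return dst;
}

int main() {
    Vec4 a{1.f, 2.f, 3.f, 4.f};
    Vec4 b{1.f, 5.f, 2.f, 4.f};
    Vec4 eq = equal4(a, b);    // 1 0 0 1
    Vec4 gt = greater4(a, b);  // 0 0 1 0
    printf("equal:   %.0f %.0f %.0f %.0f\n", eq[0], eq[1], eq[2], eq[3]);
    printf("greater: %.0f %.0f %.0f %.0f\n", gt[0], gt[1], gt[2], gt[3]);
    return 0;
}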
5 changes: 5 additions & 0 deletions source/tnn/device/arm/acc/arm_binary_layer_acc.h
@@ -27,6 +27,8 @@ enum class ArmBinaryOpType : int {
kMAX = 4,
kMIN = 5,
kHARDSWISH = 6,
kEQUAL = 7,
kGREATER = 8,
};

class ArmBinaryLayerAcc : public ArmLayerAcc {
@@ -41,6 +43,9 @@ class ArmBinaryLayerAcc : public ArmLayerAcc {
template <typename T, ArmBinaryOpType op_type>
Status Exec(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs);

template <typename T_IN, typename T_OUT>
Status TransDataType(void *data, const DimsVector &shapes);

// int8 will be implemented inside op
virtual Status ExecInt8(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs);

20 changes: 19 additions & 1 deletion source/tnn/device/arm/acc/arm_cast_layer_acc.cc
@@ -41,18 +41,21 @@ Status ArmCastLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std::
if (output_dims.size() > 1) {
channel = output_dims[1];
}

count = count / channel;
count = count * ROUND_UP(channel, 4);
}

if (input_data_type == output_data_type) {
if (output_data_type == DATA_TYPE_FLOAT ||
output_data_type == DATA_TYPE_BFP16 ||
output_data_type == DATA_TYPE_INT32) {
output_data_type == DATA_TYPE_INT32 ||
output_data_type == DATA_TYPE_INT8) {
if (output_data != input_data) {
memcpy(output_data, input_data, count * ele_size);
}
} else {
LOGE("Unsupported data type in cast %d\n", (int)output_data_type);
return Status(TNNERR_LAYER_ERR, "Unsupported data type in cast");
}
} else {
@@ -70,7 +73,22 @@ Status ArmCastLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std::
for(int i = 0; i < count; ++i) {
output_data_ptr[i] = static_cast<int>(input_data_ptr[i]);
}
} else if (input_data_type == DATA_TYPE_FLOAT &&
output_data_type == DATA_TYPE_INT8) {
auto *input_data_ptr = (float *)input_data;
auto *output_data_ptr = (int8_t *)output_data;
for(int i = 0; i < count; ++i) {
output_data_ptr[i] = static_cast<int8_t>(input_data_ptr[i]);
}
} else if (input_data_type == DATA_TYPE_INT32 &&
output_data_type == DATA_TYPE_INT8) {
auto *input_data_ptr = (int *)input_data;
auto *output_data_ptr = (int8_t *)output_data;
for(int i = 0; i < count; ++i) {
output_data_ptr[i] = static_cast<int8_t>(input_data_ptr[i]);
}
} else {
LOGE("Unsupported data type in cast input=%d output=%d\n", (int)input_data_type, (int)output_data_type);
return Status(TNNERR_LAYER_ERR, "Unsupported data type in cast");
}
}
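The added INT8 cast branches are plain per-element static_casts: float values are truncated toward zero and nothing is clamped or rounded. A small standalone sketch of that narrowing behavior (not TNN code; values are assumed to fit in int8 range, which the diff also assumes):

#include <cstdint>
#include <cstdio>

int main() {
    // float -> int8: truncation toward zero, mirroring the new cast loop
    float in_f[4] = {1.9f, -2.7f, 0.5f, 100.f};
    int8_t out_f[4];
    for (int i = 0; i < 4; ++i) {
        out_f[i] = static_cast<int8_t>(in_f[i]);
    }
    printf("%d %d %d %d\n", out_f[0], out_f[1], out_f[2], out_f[3]);  // 1 -2 0 100

    // int32 -> int8: same element-wise narrowing
    int in_i[4] = {127, -5, 0, 42};
    int8_t out_i[4];
    for (int i = 0; i < 4; ++i) {
        out_i[i] = static_cast<int8_t>(in_i[i]);
    }
    printf("%d %d %d %d\n", out_i[0], out_i[1], out_i[2], out_i[3]);  // 127 -5 0 42
    return 0;
}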
17 changes: 13 additions & 4 deletions source/tnn/device/arm/acc/arm_concat_layer_acc.cc
@@ -18,6 +18,7 @@
#include "tnn/utils/dims_utils.h"
#include "tnn/utils/naive_compute.h"
#include "tnn/utils/data_type_utils.h"
#include <math.h>

namespace TNN_NS {

@@ -216,7 +217,12 @@ static int concat_common_i8(Blob *output, const std::vector<Blob *> &inputs, int
}

static DimsVector GetCXRoundDims(const DimsVector &dims, const int round) {
DimsVector round_dims = {dims[0], UP_DIV(dims[1], round)};
DimsVector round_dims = {dims[0]};
if (dims.size() < 2) {
round_dims.push_back(0);
} else {
round_dims.push_back(UP_DIV(dims[1], round));
}
for (int i = 2; i < dims.size(); ++i) {
round_dims.push_back(dims[i]);
}
@@ -242,6 +248,8 @@ static int concat_common(Blob *output, const std::vector<Blob *> &inputs, int ax
auto input_dims = input->GetBlobDesc().dims;
DimsVector round_input_dims = GetCXRoundDims(input_dims, 4);
auto input_stride = DimsVectorUtils::Count(round_input_dims, axis);
// during concat, if the original data has fewer than 4 elements, take at least 1 element per step
input_stride = std::max(1, input_stride);
auto input_ptr = reinterpret_cast<T *>(GetBlobHandlePtr(input->GetHandle())) + n * input_stride;
memcpy(output_ptr, input_ptr, input_stride * sizeof(T));
output_ptr += input_stride;
@@ -492,14 +500,15 @@ Status ArmConcatLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std
}

if (inputs[0]->GetBlobDesc().data_format == DATA_FORMAT_NCHW) {
return ExecNchw(inputs, outputs);
ExecNchw(inputs, outputs);
} else if (inputs[0]->GetBlobDesc().data_format == DATA_FORMAT_NC4HW4 ||
inputs[0]->GetBlobDesc().data_format == DATA_FORMAT_NC8HW8) {
return Exec(inputs, outputs);
Exec(inputs, outputs);
} else {
return Status(TNNERR_LAYER_ERR, "Unsupported data format in concat");
}
return TNNERR_LAYER_ERR;

return TNN_OK;
}

REGISTER_ARM_ACC(Concat, LAYER_CONCAT)
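The two concat changes guard against shapes with fewer than two dimensions: GetCXRoundDims no longer indexes dims[1] for 1-D shapes, and input_stride is clamped to at least 1 so the memcpy still advances. A standalone sketch of that behavior under those assumptions (UP_DIV here is a local stand-in for TNN's macro, and the stride computation imitates DimsVectorUtils::Count):

#include <algorithm>
#include <cstdio>
#include <vector>

static int UP_DIV(int x, int y) { return (x + y - 1) / y; }  // local stand-in for TNN's macro

// mirrors the patched GetCXRoundDims: 1-D shapes get a 0 channel entry
// instead of reading dims[1] out of bounds
std::vector<int> GetCXRoundDims(const std::vector<int> &dims, int round) {
    std::vector<int> round_dims = {dims[0]};
    if (dims.size() < 2) {
        round_dims.push_back(0);
    } else {
        round_dims.push_back(UP_DIV(dims[1], round));
    }
    for (size_t i = 2; i < dims.size(); ++i) {
        round_dims.push_back(dims[i]);
    }
    return round_dims;
}

int main() {
    auto a = GetCXRoundDims({8}, 4);        // {8, 0}
    auto b = GetCXRoundDims({1, 6, 5}, 4);  // {1, 2, 5}

    // the trailing-dim product of {8, 0} is 0, so the new std::max(1, input_stride)
    // clamp keeps the concat memcpy copying at least 1 element per step
    int input_stride = 1;
    for (size_t i = 1; i < a.size(); ++i) input_stride *= a[i];
    input_stride = std::max(1, input_stride);

    printf("a = {%d, %d}, b = {%d, %d, %d}, stride = %d\n",
           a[0], a[1], b[0], b[1], b[2], input_stride);
    return 0;
}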
3 changes: 3 additions & 0 deletions source/tnn/device/arm/acc/arm_concat_layer_acc.h
@@ -24,6 +24,9 @@ class ArmConcatLayerAcc : public ArmLayerAcc {
public:
virtual ~ArmConcatLayerAcc();

template <typename T_IN, typename T_OUT>
Status TransDataType(void *data, const DimsVector &shapes, bool padding = false);

Status ExecInt8(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs);
Status ExecNchw(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs);
Status Exec(const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs);
39 changes: 39 additions & 0 deletions source/tnn/device/arm/acc/arm_equal_layer_acc.cc
@@ -0,0 +1,39 @@
// Tencent is pleased to support the open source community by making TNN available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "tnn/device/arm/acc/arm_binary_layer_acc.h"

namespace TNN_NS {

DECLARE_ARM_BINARY_ACC(Equal);

Status ArmEqualLayerAcc::Init(Context *context, LayerParam *param, LayerResource *resource,
const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
Status status = ArmBinaryLayerAcc::Init(context, param, resource, inputs, outputs);
if (status != TNN_OK) {
return status;
}

op_type_ = ArmBinaryOpType::kEQUAL;

return TNN_OK;
}

ArmEqualLayerAcc::~ArmEqualLayerAcc() {}

REGISTER_ARM_ACC(Equal, LAYER_EQUAL)
REGISTER_ARM_PRECISION_FP16(LAYER_EQUAL)
REGISTER_ARM_LAYOUT(LAYER_EQUAL, DATA_FORMAT_NC4HW4)

} // namespace TNN_NS
4 changes: 2 additions & 2 deletions source/tnn/device/arm/acc/arm_gather_layer_acc.cc
@@ -59,7 +59,7 @@ Status ArmGatherLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std

const int ele_size = DataTypeUtils::GetBytesSize(outputs[0]->GetBlobDesc().data_type);
auto output_data_ptr = GetBlobHandlePtr(outputs[0]->GetHandle());

for (int b = 0; b < batch; b++) {
int input_index_b = b * input_slice_count * slice_size;
int output_index_b = b * output_slice_count * slice_size;
@@ -69,7 +69,7 @@ Status ArmGatherLayerAcc::DoForward(const std::vector<Blob *> &inputs, const std
slice_index += input_slice_count;
}
if (slice_index < 0 || slice_index >= input_slice_count) {
LOGE("ArmGatherLayerAcc::Forward invalid slice_index\n");
LOGE("ArmGatherLayerAcc::Forward invalid slice_index %d %d\n", slice_index, (int)indices_data_ptr[i]);
return Status(TNNERR_MODEL_ERR, "ArmGatherLayerAcc::Forward invalid slice_index");
}
int input_index = input_index_b + slice_index * slice_size;
39 changes: 39 additions & 0 deletions source/tnn/device/arm/acc/arm_greater_layer_acc.cc
@@ -0,0 +1,39 @@
// Tencent is pleased to support the open source community by making TNN available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "tnn/device/arm/acc/arm_binary_layer_acc.h"

namespace TNN_NS {

DECLARE_ARM_BINARY_ACC(Greater);

Status ArmGreaterLayerAcc::Init(Context *context, LayerParam *param, LayerResource *resource,
const std::vector<Blob *> &inputs, const std::vector<Blob *> &outputs) {
Status status = ArmBinaryLayerAcc::Init(context, param, resource, inputs, outputs);
if (status != TNN_OK) {
return status;
}

op_type_ = ArmBinaryOpType::kGREATER;

return TNN_OK;
}

ArmGreaterLayerAcc::~ArmGreaterLayerAcc() {}

REGISTER_ARM_ACC(Greater, LAYER_GREATER)
REGISTER_ARM_PRECISION_FP16(LAYER_GREATER)
REGISTER_ARM_LAYOUT(LAYER_GREATER, DATA_FORMAT_NC4HW4)

} // namespace TNN_NS