Skip to content

Commit

Permalink
promote vfpv4 for auto fp16 storage conversion (#5325)
Browse files — browse the repository at this point in the history
* promote vfpv4 for auto fp16 storage conversion

* always report neon and vfpv4 for arm64
  • Branch information
nihui committed Feb 2, 2024
1 parent 5b536af commit 984d6dd
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 52 deletions.
45 changes: 14 additions & 31 deletions src/cpu.cpp
Expand Up @@ -129,9 +129,6 @@ static ncnn::CpuSet g_cpu_affinity_mask_big;

// isa info
#if defined _WIN32
#if __arm__
static int g_cpu_support_arm_neon;
static int g_cpu_support_arm_vfpv4;
#if __aarch64__
static int g_cpu_support_arm_asimdhp;
static int g_cpu_support_arm_cpuid;
Expand All @@ -144,10 +141,11 @@ static int g_cpu_support_arm_sve2;
static int g_cpu_support_arm_svebf16;
static int g_cpu_support_arm_svei8mm;
static int g_cpu_support_arm_svef32mm;
#else // __aarch64__
#elif __arm__
static int g_cpu_support_arm_edsp;
#endif // __aarch64__
#endif // __arm__
static int g_cpu_support_arm_neon;
static int g_cpu_support_arm_vfpv4;
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
Expand Down Expand Up @@ -2040,9 +2038,6 @@ static void initialize_global_cpu_info()
g_is_being_debugged = is_being_debugged();

#if defined _WIN32
#if __arm__
g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#if __aarch64__
g_cpu_support_arm_cpuid = detectisa(some_cpuid);
g_cpu_support_arm_asimdhp = detectisa(some_asimdhp) || IsProcessorFeaturePresent(43); // dp implies hp
Expand All @@ -2055,10 +2050,11 @@ static void initialize_global_cpu_info()
g_cpu_support_arm_svebf16 = detectisa(some_svebf16);
g_cpu_support_arm_svei8mm = detectisa(some_svei8mm);
g_cpu_support_arm_svef32mm = detectisa(some_svef32mm);
#else // __aarch64__
#elif __arm__
g_cpu_support_arm_edsp = detectisa(some_edsp);
#endif // __aarch64__
#endif // __arm__
g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
g_hwcaps = get_elf_hwcap(AT_HWCAP);
g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
Expand Down Expand Up @@ -2271,21 +2267,15 @@ int cpu_support_arm_edsp()
int cpu_support_arm_neon()
{
try_initialize_global_cpu_info();
#if __arm__
#if __aarch64__
return 1;
#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_neon;
#elif defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
return 0;
#endif
Expand All @@ -2297,22 +2287,15 @@ int cpu_support_arm_neon()
int cpu_support_arm_vfpv4()
{
try_initialize_global_cpu_info();
#if __arm__
#if __aarch64__
return 1;
#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_vfpv4;
#elif defined __ANDROID__ || defined __linux__
#if __aarch64__
// neon always enable fma and fp16
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
return 0;
#endif
Expand Down
18 changes: 9 additions & 9 deletions src/net.cpp
Expand Up @@ -621,15 +621,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && layer->support_fp16_storage)
#if NCNN_VFPV4
if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && layer->support_fp16_storage)
{
Expand Down Expand Up @@ -731,15 +731,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && !layer->support_fp16_storage)
#if NCNN_VFPV4
if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && !layer->support_fp16_storage)
{
Expand Down Expand Up @@ -2691,8 +2691,8 @@ int Extractor::extract(int blob_index, Mat& feat, int type)

// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
#if NCNN_VFPV4
if (d->opt.use_fp16_storage && cpu_support_arm_vfpv4() && (type == 0))
{
if (feat.elembits() == 16)
{
Expand All @@ -2702,7 +2702,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
}
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_BF16
if (d->opt.use_bf16_storage && (type == 0))
{
Expand Down
24 changes: 12 additions & 12 deletions tests/testutil.cpp
Expand Up @@ -446,13 +446,13 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
{
// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
ncnn::cast_float32_to_float16(a[i], a4[i], opt);
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
Expand Down Expand Up @@ -571,15 +571,15 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
{
// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c[i].elembits() == 16)
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c[i].elembits() == 16)
{
ncnn::Mat c_fp32;
ncnn::cast_float16_to_float32(c[i], c_fp32, opt);
c[i] = c_fp32;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c[i].elembits() == 16)
{
Expand Down Expand Up @@ -961,13 +961,13 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n

// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
ncnn::cast_float32_to_float16(a, a4, opt);
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
Expand Down Expand Up @@ -1077,15 +1077,15 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n

// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c.elembits() == 16)
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c.elembits() == 16)
{
ncnn::Mat c_fp32;
ncnn::cast_float16_to_float32(c, c_fp32, opt);
c = c_fp32;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c.elembits() == 16)
{
Expand Down

0 comments on commit 984d6dd

Please sign in to comment.