Skip to content

Commit

Permalink
promote vfpv4 for auto fp16 storage conversion (#5325)
Browse files — browse the repository at this point in the history
* promote vfpv4 for auto fp16 storage conversion

* always report neon and vfpv4 for arm64
  • Branch information
nihui committed Feb 2, 2024
1 parent 5b536af commit 984d6dd
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 52 deletions.
45 changes: 14 additions & 31 deletions src/cpu.cpp
Expand Up @@ -129,9 +129,6 @@ static ncnn::CpuSet g_cpu_affinity_mask_big;

// isa info
#if defined _WIN32
#if __arm__
static int g_cpu_support_arm_neon;
static int g_cpu_support_arm_vfpv4;
#if __aarch64__
static int g_cpu_support_arm_asimdhp;
static int g_cpu_support_arm_cpuid;
Expand All @@ -144,10 +141,11 @@ static int g_cpu_support_arm_sve2;
static int g_cpu_support_arm_svebf16;
static int g_cpu_support_arm_svei8mm;
static int g_cpu_support_arm_svef32mm;
#else // __aarch64__
#elif __arm__
static int g_cpu_support_arm_edsp;
#endif // __aarch64__
#endif // __arm__
static int g_cpu_support_arm_neon;
static int g_cpu_support_arm_vfpv4;
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
Expand Down Expand Up @@ -2040,9 +2038,6 @@ static void initialize_global_cpu_info()
g_is_being_debugged = is_being_debugged();

#if defined _WIN32
#if __arm__
g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#if __aarch64__
g_cpu_support_arm_cpuid = detectisa(some_cpuid);
g_cpu_support_arm_asimdhp = detectisa(some_asimdhp) || IsProcessorFeaturePresent(43); // dp implies hp
Expand All @@ -2055,10 +2050,11 @@ static void initialize_global_cpu_info()
g_cpu_support_arm_svebf16 = detectisa(some_svebf16);
g_cpu_support_arm_svei8mm = detectisa(some_svei8mm);
g_cpu_support_arm_svef32mm = detectisa(some_svef32mm);
#else // __aarch64__
#elif __arm__
g_cpu_support_arm_edsp = detectisa(some_edsp);
#endif // __aarch64__
#endif // __arm__
g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
g_hwcaps = get_elf_hwcap(AT_HWCAP);
g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
Expand Down Expand Up @@ -2271,21 +2267,15 @@ int cpu_support_arm_edsp()
int cpu_support_arm_neon()
{
try_initialize_global_cpu_info();
#if __arm__
#if __aarch64__
return 1;
#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_neon;
#elif defined __ANDROID__ || defined __linux__
#if __aarch64__
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_NEON;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
return 0;
#endif
Expand All @@ -2297,22 +2287,15 @@ int cpu_support_arm_neon()
int cpu_support_arm_vfpv4()
{
try_initialize_global_cpu_info();
#if __arm__
#if __aarch64__
return 1;
#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_vfpv4;
#elif defined __ANDROID__ || defined __linux__
#if __aarch64__
// neon always enable fma and fp16
return g_hwcaps & HWCAP_ASIMD;
#else
return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __APPLE__
#if __aarch64__
return g_hw_cputype == CPU_TYPE_ARM64;
#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
return 0;
#endif
Expand Down
18 changes: 9 additions & 9 deletions src/net.cpp
Expand Up @@ -621,15 +621,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && layer->support_fp16_storage)
#if NCNN_VFPV4
if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && layer->support_fp16_storage)
{
Expand Down Expand Up @@ -731,15 +731,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && !layer->support_fp16_storage)
#if NCNN_VFPV4
if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && !layer->support_fp16_storage)
{
Expand Down Expand Up @@ -2691,8 +2691,8 @@ int Extractor::extract(int blob_index, Mat& feat, int type)

// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
#if NCNN_VFPV4
if (d->opt.use_fp16_storage && cpu_support_arm_vfpv4() && (type == 0))
{
if (feat.elembits() == 16)
{
Expand All @@ -2702,7 +2702,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
}
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_BF16
if (d->opt.use_bf16_storage && (type == 0))
{
Expand Down
24 changes: 12 additions & 12 deletions tests/testutil.cpp
Expand Up @@ -446,13 +446,13 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
{
// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
ncnn::cast_float32_to_float16(a[i], a4[i], opt);
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
Expand Down Expand Up @@ -571,15 +571,15 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
{
// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c[i].elembits() == 16)
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c[i].elembits() == 16)
{
ncnn::Mat c_fp32;
ncnn::cast_float16_to_float32(c[i], c_fp32, opt);
c[i] = c_fp32;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c[i].elembits() == 16)
{
Expand Down Expand Up @@ -961,13 +961,13 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n

// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
ncnn::cast_float32_to_float16(a, a4, opt);
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
Expand Down Expand Up @@ -1077,15 +1077,15 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n

// clang-format off
// *INDENT-OFF*
#if NCNN_ARM82
if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c.elembits() == 16)
#if NCNN_VFPV4
if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c.elembits() == 16)
{
ncnn::Mat c_fp32;
ncnn::cast_float16_to_float32(c, c_fp32, opt);
c = c_fp32;
}
else
#endif // NCNN_ARM82
#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c.elembits() == 16)
{
Expand Down

0 comments on commit 984d6dd

Please sign in to comment.