Skip to content

Commit

Permalink
add tanh avx512 optimize (#3770)
Browse files: browse the repository at this point in the history
  • Loading branch information
jasonZhang892 committed May 8, 2022
1 parent f9c1787 commit 663b42e
Showing 1 changed file with 11 additions and 57 deletions.
68 changes: 11 additions & 57 deletions src/layer/x86/tanh_x86.cpp
Expand Up @@ -31,65 +31,10 @@ int TanH_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int size = w * h;
#if __SSE2__
int elempack = bottom_top_blob.elempack;

#if __AVX__
#if __AVX512F__
if (elempack == 16)
{
Mat tmp;
convert_packing(bottom_top_blob, tmp, 8, opt);

forward_inplace(tmp, opt);

convert_packing(tmp, bottom_top_blob, 16, opt);

return 0;
}
#endif // __AVX512F__

if (elempack == 8)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

for (int i = 0; i < size; i++)
{
__m256 _p = _mm256_loadu_ps(ptr);
_p = tanh_avx(_p);
_mm256_storeu_ps(ptr, _p);
ptr += 8;
}
}

return 0;
}
#endif // __AVX__

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

for (int i = 0; i < size; i++)
{
__m128 _p = _mm_loadu_ps(ptr);
_p = tanh_sse(_p);
_mm_storeu_ps(ptr, _p);
ptr += 4;
}
}

return 0;
}
#endif // __SSE2__
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
Expand All @@ -99,6 +44,15 @@ int TanH_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int i = 0;
#if __SSE2__
#if __AVX__
#if __AVX512F__
for (; i + 15 < size; i += 16)
{
__m512 _p = _mm512_loadu_ps(ptr);
_p = tanh_avx512(_p);
_mm512_storeu_ps(ptr, _p);
ptr += 16;
}
#endif
for (; i + 7 < size; i += 8)
{
__m256 _p = _mm256_loadu_ps(ptr);
Expand Down

0 comments on commit 663b42e

Please sign in to comment.