Merge branch 'Tencent:master' into physical-cpu-count
nihui committed Oct 30, 2022
2 parents 73a8140 + 9c6f110 commit 4356346
Showing 27 changed files with 2,291 additions and 3,960 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/pnnx.yml
@@ -41,6 +41,9 @@ jobs:
- torch-version: 1.12.0
torchvision-version: 0.13.0

- torch-version: 1.13.0
torchvision-version: 0.14.0

steps:
- uses: actions/checkout@v3
with:
4 changes: 2 additions & 2 deletions .github/workflows/release-python.yml
@@ -53,7 +53,7 @@ jobs:
brew uninstall --ignore-dependencies libomp
- name: Build wheels
uses: pypa/cibuildwheel@v2.11.1
uses: pypa/cibuildwheel@v2.11.2
env:
CIBW_ARCHS_MACOS: x86_64 universal2 arm64
CIBW_ARCHS_LINUX: x86_64 i686
@@ -103,7 +103,7 @@ jobs:
platforms: all

- name: Build wheels
uses: pypa/cibuildwheel@v2.11.1
uses: pypa/cibuildwheel@v2.11.2
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
6 changes: 2 additions & 4 deletions src/layer/arm/convolution_arm.cpp
@@ -387,10 +387,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
// conv3x3s1_winograd63_transform_kernel_neon(weight_data, weight_winograd63_data, num_input, num_output, opt);
conv3x3s1_winograd63_transform_kernel_neon5(weight_data, weight_winograd63_data, num_input, num_output, opt);
}
else
{
weight_data_tm = weight_data;
}

weight_data_tm = weight_data;
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
38 changes: 2 additions & 36 deletions src/layer/x86/convolution_3x3_pack16to1.h
@@ -290,7 +290,7 @@ static void conv3x3s1_winograd63_pack16to1_avx512(const Mat& bottom_blob, Mat& t
__m512 _re = _mm512_loadu_ps(r0 + 16 * 14);
__m512 _rf = _mm512_loadu_ps(r0 + 16 * 15);

transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf);
transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
@@ -333,41 +333,7 @@ static void conv3x3s1_winograd63_pack16to1_avx512(const Mat& bottom_blob, Mat& t
__m512 _r6 = _mm512_load_ps(r0 + 16 * 6);
__m512 _r7 = _mm512_load_ps(r0 + 16 * 7);

__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
__m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
__m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
__m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
__m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
__m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
__m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);

__m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));

_tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0));
_tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
_tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0));
_tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
_tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1));
_tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
_tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1));
_tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));

_r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
_r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
_r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
_r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
_r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
_r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
_r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);

_mm512_store_ps(tmpptr, _r0);
_mm512_store_ps(tmpptr + 16, _r1);
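
Note on the refactor above: the hand-written unpack/shuffle sequence is replaced by a single call to transpose16x8_ps, and the 16x16 call site switches from transpose16_ps to transpose16x16_ps. A minimal sketch of such a 16x8 helper, reconstructed from the inline code removed in this hunk, is shown below; the by-reference signature and the placement in a shared header are assumptions based on the call sites, not taken from this diff.

#include <immintrin.h>

// Sketch only: in-place transpose of the 16x8 float tile held across eight
// 16-lane AVX-512 registers, reconstructed from the removed inline sequence.
static inline void transpose16x8_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3,
                                    __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7)
{
    // interleave adjacent register pairs at 32-bit granularity
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
    __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
    __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
    __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
    __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);

    // regroup 64-bit pairs across register pairs
    __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));

    // reorder 128-bit lanes, first pass
    _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));

    // reorder 128-bit lanes, second pass, writing back to the inputs
    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
    _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
}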
117 changes: 4 additions & 113 deletions src/layer/x86/convolution_sgemm_pack16.h
@@ -67,57 +67,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob,
__m512 _ra = _mm512_loadu_ps(img0 + 16 * 10);
__m512 _rb = _mm512_loadu_ps(img0 + 16 * 11);

__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
__m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
__m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
__m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
__m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
__m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
__m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);
__m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9);
__m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9);
__m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb);
__m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb);

__m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));

_tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0));
_tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
_tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0));
_tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0));
_tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
_tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0));
_tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1));
_tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
_tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1));
_tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1));
_tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));
_tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1));

_r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
_r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
_r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
_r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0));
_r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0));
_r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
_r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
_r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
_r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
_ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1));
_rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1));
transpose16x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
@@ -164,41 +114,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob,
__m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6);
__m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7);

__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
__m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
__m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
__m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
__m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
__m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
__m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);

__m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));

_tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0));
_tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
_tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0));
_tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
_tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1));
_tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
_tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1));
_tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));

_r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
_r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
_r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
_r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
_r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
_r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
_r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
@@ -237,25 +153,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob,
__m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2);
__m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3);

__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
__m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
__m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);

__m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));

_tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
_tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
_tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
_tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));

_r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
_r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
_r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
transpose16x4_ps(_r0, _r1, _r2, _r3);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
@@ -288,14 +186,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob,
__m512 _r0 = _mm512_loadu_ps(img0);
__m512 _r1 = _mm512_loadu_ps(img0 + 16);

__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);

__m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
__m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));

_r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
transpose16x2_ps(_r0, _r1);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
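
The first hunk of this file replaces a 12-register unpack/shuffle block with a transpose16x12_ps call. Reconstructed from that removed code, a sketch of the helper could look as follows; again, the by-reference signature and in-place convention are assumptions based on the call site.

#include <immintrin.h>

// Sketch only: in-place transpose of the 16x12 float tile held across twelve
// 16-lane AVX-512 registers, reconstructed from the removed inline sequence.
static inline void transpose16x12_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3,
                                     __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7,
                                     __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb)
{
    // interleave adjacent register pairs at 32-bit granularity
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
    __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
    __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
    __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
    __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);
    __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9);
    __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9);
    __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb);
    __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb);

    // regroup 64-bit pairs across register pairs
    __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2));

    // reorder 128-bit lanes, first pass
    _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));
    _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1));

    // reorder 128-bit lanes, second pass, writing back to the inputs
    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0));
    _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0));
    _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
    _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
    _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1));
    _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1));
}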
38 changes: 2 additions & 36 deletions src/layer/x86/convolution_sgemm_pack16to1.h
@@ -66,7 +66,7 @@ static void im2col_sgemm_pack16to1_avx512(const Mat& bottom_im2col, Mat& top_blo
__m512 _re = _mm512_loadu_ps(img0 + 16 * 14);
__m512 _rf = _mm512_loadu_ps(img0 + 16 * 15);

transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf);
transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
@@ -117,41 +117,7 @@ static void im2col_sgemm_pack16to1_avx512(const Mat& bottom_im2col, Mat& top_blo
__m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6);
__m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7);

__m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
__m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
__m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
__m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);
__m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5);
__m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5);
__m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7);
__m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7);

__m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2));
__m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0));
__m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2));

_tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0));
_tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0));
_tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0));
_tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0));
_tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1));
_tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1));
_tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1));
_tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1));

_r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
_r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
_r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
_r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
_r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
_r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
_r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
_r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));
transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);

_mm512_storeu_ps(tmpptr, _r0);
_mm512_storeu_ps(tmpptr + 16, _r1);
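
For completeness, the two narrow variants used earlier in convolution_sgemm_pack16.h (transpose16x4_ps and transpose16x2_ps) follow the same pattern. The sketches below are reconstructed from the inline code removed there, with the same assumed by-reference signatures.

#include <immintrin.h>

// Sketch only: in-place transpose of the 16x4 float tile held across four
// 16-lane AVX-512 registers, reconstructed from the removed inline sequence.
static inline void transpose16x4_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3)
{
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);
    __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3);
    __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3);

    __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2));

    _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0));
    _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1));
    _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1));

    _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
}

// Sketch only: in-place transpose of the 16x2 float tile held across two
// 16-lane AVX-512 registers, reconstructed from the removed inline sequence.
static inline void transpose16x2_ps(__m512& _r0, __m512& _r1)
{
    __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1);
    __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1);

    __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0));
    __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1));

    _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
    _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
}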
